Merge Dylan's Google Summer of Code 2009 tree.
Detect fades and use weighted prediction to improve compression and quality.
"Blind" mode provides a small overall quality increase by using a -1 offset without doing any analysis, as described in JVT-AB033.
"Smart", the default mode, also performs fade detection and decides weights accordingly.
MB-tree takes into account the effects of "smart" analysis in lookahead, even further improving quality in fades.
If psy is on, mbtree is on, interlaced is off, and weightp is off, fade detection will still be performed.
However, it will be used to adjust quality instead of to create actual weights.
This will improve quality in fades when encoding in Baseline profile.
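For reference, every mode reduces to the same per-pixel explicit weighting (this
mirrors the opscale/opscale_noden macros in the C reference below; the standalone
names here are illustrative):
    dst = x264_clip_uint8( ((src * i_scale + (1 << (i_denom-1))) >> i_denom) + i_offset ); /* i_denom >= 1 */
    dst = x264_clip_uint8( src * i_scale + i_offset ); /* i_denom == 0, e.g. blind's scale=1, offset=-1 */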
Doesn't add support for interlaced encoding with weightp yet.
Only adds support for luma weights, not chroma weights;
the internal code for chroma weights is in place, but there's no analysis for it yet.
Baseline profile requires that weightp be off.
All weightp modes may cause minor breakage in non-compliant decoders that take shortcuts in deblocking reference frame checks.
"Smart" may cause serious breakage in non-compliant decoders that take shortcuts in handling of duplicate reference frames.
Thanks to Google for sponsoring our most successful Summer of Code yet!
param->analyse.i_chroma_qp_offset = 0;
param->analyse.b_fast_pskip = 1;
param->analyse.b_weighted_bipred = 1;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_SMART;
param->analyse.b_dct_decimate = 1;
param->analyse.b_transform_8x8 = 1;
param->analyse.i_trellis = 1;
p->analyse.b_transform_8x8 = atobool(value);
OPT2("weightb", "weight-b")
p->analyse.b_weighted_bipred = atobool(value);
+ OPT("weightp")
+ p->analyse.i_weighted_pred = atoi(value);
OPT2("direct", "direct-pred")
b_error |= parse_enum( value, x264_direct_pred_names, &p->analyse.i_direct_mv_pred );
OPT("chroma-qp-offset")
p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred );
}
+ s += sprintf( s, " wpredp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );
s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d",
p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold );
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
#define X264_THREAD_HEIGHT 24
+/* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled
+ * (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly
+ * to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when
+ * real weights are being used. */
+
+#define X264_WEIGHTP_FAKE (-1)
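+/* For context: the user-selectable values in x264.h are X264_WEIGHTP_NONE (0),
+ * X264_WEIGHTP_BLIND (1) and X264_WEIGHTP_SMART (2); FAKE is never accepted from
+ * user input (parameter validation clips to [0,SMART]) and is only set internally. */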
+
/****************************************************************************
* Includes
****************************************************************************/
int arg;
} ref_pic_list_order[2][16];
+ /* P-frame weighting */
+ x264_weight_t weight[16][3];
+
int i_mmco_remove_from_end;
int i_mmco_command_count;
struct /* struct for future expansion */
/* Unused frames: 0 = fenc, 1 = fdec */
x264_frame_t **unused[2];
+ /* Unused blank frames (for duplicates) */
+ x264_frame_t **blank_unused;
+
/* frames used for reference + sentinels */
x264_frame_t *reference[16+2];
uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
uint8_t (*nnz_backup)[16]; /* when using cavlc + 8x8dct, the deblocker uses a modified nnz */
+ /* buffer for weighted versions of the reference frames */
+ uint8_t *p_weight_buf[16];
+
/* current value */
int i_type;
int i_partition;
/* pointer over mb of the references */
int i_fref[2];
uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
+ uint8_t *p_fref_w[32]; /* weighted fullpel luma */
uint16_t *p_integral[2][16];
/* fref stride */
/* */
int i_direct_score[2];
int i_direct_frames[2];
+ /* num p-frames weighted */
+ int i_wpred[3];
} stat;
frame->i_frame_num = -1;
frame->i_lines_completed = -1;
frame->b_fdec = b_fdec;
+ frame->orig = frame;
/* all 4 luma planes allocated together, since the cacheline split code
* requires them to be in-phase wrt cacheline alignment. */
else
{
CHECKED_MALLOC( frame->buffer[0], luma_plane_size);
- frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
+ frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
}
+ frame->b_duplicate = 0;
+
if( b_fdec ) /* fdec frame */
{
CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
void x264_frame_delete( x264_frame_t *frame )
{
int i, j;
- for( i = 0; i < 4; i++ )
- x264_free( frame->buffer[i] );
- for( i = 0; i < 4; i++ )
- x264_free( frame->buffer_lowres[i] );
- for( i = 0; i < X264_BFRAME_MAX+2; i++ )
- for( j = 0; j < X264_BFRAME_MAX+2; j++ )
- x264_free( frame->i_row_satds[i][j] );
- for( j = 0; j < 2; j++ )
- for( i = 0; i <= X264_BFRAME_MAX; i++ )
- {
- x264_free( frame->lowres_mvs[j][i] );
- x264_free( frame->lowres_mv_costs[j][i] );
- }
- x264_free( frame->i_propagate_cost );
- for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
- for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
- {
- x264_free( frame->lowres_costs[j][i] );
- x264_free( frame->lowres_inter_types[j][i] );
- }
- x264_free( frame->f_qp_offset );
- x264_free( frame->f_qp_offset_aq );
- x264_free( frame->i_inv_qscale_factor );
- x264_free( frame->i_row_bits );
- x264_free( frame->i_row_qp );
- x264_free( frame->mb_type );
- x264_free( frame->mv[0] );
- x264_free( frame->mv[1] );
- x264_free( frame->ref[0] );
- x264_free( frame->ref[1] );
- x264_pthread_mutex_destroy( &frame->mutex );
- x264_pthread_cond_destroy( &frame->cv );
+ /* Duplicate frames are blank copies of real frames (including pointers),
+ * so freeing those pointers would cause a double free later. */
+ if( !frame->b_duplicate )
+ {
+ for( i = 0; i < 4; i++ )
+ x264_free( frame->buffer[i] );
+ for( i = 0; i < 4; i++ )
+ x264_free( frame->buffer_lowres[i] );
+ for( i = 0; i < X264_BFRAME_MAX+2; i++ )
+ for( j = 0; j < X264_BFRAME_MAX+2; j++ )
+ x264_free( frame->i_row_satds[i][j] );
+ for( j = 0; j < 2; j++ )
+ for( i = 0; i <= X264_BFRAME_MAX; i++ )
+ {
+ x264_free( frame->lowres_mvs[j][i] );
+ x264_free( frame->lowres_mv_costs[j][i] );
+ }
+ x264_free( frame->i_propagate_cost );
+ for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
+ for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
+ {
+ x264_free( frame->lowres_costs[j][i] );
+ x264_free( frame->lowres_inter_types[j][i] );
+ }
+ x264_free( frame->f_qp_offset );
+ x264_free( frame->f_qp_offset_aq );
+ x264_free( frame->i_inv_qscale_factor );
+ x264_free( frame->i_row_bits );
+ x264_free( frame->i_row_qp );
+ x264_free( frame->mb_type );
+ x264_free( frame->mv[0] );
+ x264_free( frame->mv[1] );
+ x264_free( frame->ref[0] );
+ x264_free( frame->ref[1] );
+ x264_pthread_mutex_destroy( &frame->mutex );
+ x264_pthread_cond_destroy( &frame->cv );
+ }
x264_free( frame );
}
int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
int i4p= mb_4x4+x+y*s4x4;\
int i4q= mbn_4x4+xn+yn*s4x4;\
- if((h->mb.ref[0][i8p] != h->mb.ref[0][i8q] ||\
+ int refs_equal;\
+ if( h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
+ refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
+ else if( !h->mb.b_interlaced )\
+ refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
+ else\
+ refs_equal = ( h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc ) &&\
+ ( (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1) );\
+ if((!refs_equal ||\
abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
(h->sh.i_type == SLICE_TYPE_B &&\
frame->i_reference_count = 1;
frame->b_intra_calculated = 0;
frame->b_scenecut = 1;
+
+ memset( frame->weight, 0, sizeof(frame->weight) );
+ memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
+
+ return frame;
+}
+
+void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
+{
+ assert( frame->i_reference_count > 0 );
+ frame->i_reference_count--;
+ if( frame->i_reference_count == 0 )
+ x264_frame_push( h->frames.blank_unused, frame );
+}
+
+x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
+{
+ x264_frame_t *frame;
+ if( h->frames.blank_unused[0] )
+ frame = x264_frame_pop( h->frames.blank_unused );
+ else
+ frame = x264_malloc( sizeof(x264_frame_t) );
+ if( !frame )
+ return NULL;
+ frame->b_duplicate = 1;
+ frame->i_reference_count = 1;
return frame;
}
} while( !b_ok );
}
+void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
+ int i_width, int i_height, x264_weight_t *w )
+{
+ int x;
+ /* Weight horizontal strips of height 16. This was found to be the optimal
+ * height in terms of cache loads. */
+ while( i_height > 0 )
+ {
+ for( x = 0; x < i_width ; x += 16 )
+ w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
+ i_height -= 16;
+ dst += 16 * i_dst_stride;
+ src += 16 * i_src_stride;
+ }
+}
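+/* Illustrative note: weightfn is indexed by width/4, matching the
+ * {w2,w4,w8,w12,w16,w20} layout of x264_mc_weight_wtab, so weightfn[16>>2]
+ * above selects the 16-pixel-wide kernel. */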
+
void x264_frame_delete_list( x264_frame_t **list )
{
int i = 0;
+ if( !list )
+ return;
while( list[i] )
x264_frame_delete( list[i++] );
x264_free( list );
#define PADH 32
#define PADV 32
-typedef struct
+typedef struct x264_frame
{
/* */
int i_poc;
uint8_t *buffer[4];
uint8_t *buffer_lowres[4];
+ x264_weight_t weight[16][3]; /* the weights for the P frames used to encode this frame */
+ uint8_t *weighted[16]; /* plane[0] weighted of the reference frames */
+ int b_duplicate;
+ struct x264_frame *orig;
+
/* motion data */
int8_t *mb_type;
int16_t (*mv[2])[2];
uint16_t *i_propagate_cost;
uint16_t *i_inv_qscale_factor;
int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
+ float f_weighted_cost_delta[X264_BFRAME_MAX+2];
/* vbv */
uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
/* threading */
int i_lines_completed; /* in pixels */
+ int i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */
int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv;
void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
x264_frame_t *x264_frame_shift( x264_frame_t **list );
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
+void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
+x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
+void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
+ int i_width, int i_height, x264_weight_t *w );
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
void x264_frame_sort( x264_frame_t **list, int b_dts );
void x264_frame_delete_list( x264_frame_t **list );
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
- mvx, mvy, 4*width, 4*height );
+ mvx, mvy, 4*width, 4*height, &h->sh.weight[i_ref][0] );
// chroma is offset if MCing from a field of opposite parity
if( h->mb.b_interlaced & i_ref )
h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
+ if( h->sh.weight[i_ref][1].weightfn )
+ h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->sh.weight[i_ref][1], height*2 );
+
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height );
+
+ if( h->sh.weight[i_ref][2].weightfn )
+ h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->sh.weight[i_ref][2], height*2 );
}
static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
{
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
- mvx, mvy, 4*width, 4*height );
+ mvx, mvy, 4*width, 4*height, weight_none );
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
uint8_t *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
- mvx0, mvy0, 4*width, 4*height );
+ mvx0, mvy0, 4*width, 4*height, weight_none );
src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
- mvx1, mvy1, 4*width, 4*height );
+ mvx1, mvy1, 4*width, 4*height, weight_none );
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1, weight );
for( i=0; i<2; i++ )
{
int i_refs = X264_MIN(16, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced;
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ i_refs = X264_MIN(16, i_refs + 2); //smart weights add two duplicate frames
+ else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
+ i_refs = X264_MIN(16, i_refs + 1); //blind weights add one duplicate frame
+
for( j=0; j < i_refs; j++ )
CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) );
}
+ if( h->param.analyse.i_weighted_pred )
+ {
+ int i_padv = PADV << h->param.b_interlaced;
+#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
+ int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
+ int i_stride, luma_plane_size;
+ int numweightbuf;
+
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE )
+ {
+ // only need buffer for lookahead thread
+ if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] )
+ {
+ // Fake analysis only works on lowres
+ i_stride = ALIGN( h->sps->i_mb_width*8 + 2*PADH, align );
+ luma_plane_size = i_stride * (h->sps->i_mb_height*8+2*i_padv);
+ // Only need 1 buffer for analysis
+ numweightbuf = 1;
+ }
+ else
+ numweightbuf = 0;
+ }
+ else
+ {
+ i_stride = ALIGN( h->sps->i_mb_width*16 + 2*PADH, align );
+ luma_plane_size = i_stride * (h->sps->i_mb_height*16+2*i_padv);
+
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ // SMART needs one buffer for the weighted ref and one for the -1 offset duplicate
+ numweightbuf = 2;
+ else
+ // BLIND has only one weighted copy (the -1 offset)
+ numweightbuf = 1;
+ }
+
+ for( i = 0; i < numweightbuf; i++ )
+ CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size );
+#undef ALIGN
+ }
+
for( i=0; i<=h->param.b_interlaced; i++ )
for( j=0; j<3; j++ )
{
for( i=0; i<2; i++ )
for( j=0; j<32; j++ )
x264_free( h->mb.mvr[i][j] );
+ for( i=0; i<16; i++ )
+ x264_free( h->mb.p_weight_buf[i] );
+
if( h->param.b_cabac )
{
x264_free( h->mb.chroma_pred_mode );
{
h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
if( i == 0 )
+ {
for( k = 1; k < 4; k++ )
h->mb.pic.p_fref[0][j][k] = &fref[0][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+ if( h->sh.weight[j][0].weightfn )
+ h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> h->mb.b_interlaced][ref_pix_offset[j&1]];
+ else
+ h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
+ }
}
if( h->sh.i_type == SLICE_TYPE_B )
for( j = 0; j < h->mb.pic.i_fref[1]; j++ )
PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
+static void x264_weight_cache( x264_t *h, x264_weight_t *w )
+{
+ w->weightfn = h->mc.weight;
+}
+#define opscale(x) dst[x] = x264_clip_uint8( ( ( src[x] * weight->i_scale + (1 << (weight->i_denom - 1)) ) >> weight->i_denom ) + weight->i_offset )
+#define opscale_noden(x) dst[x] = x264_clip_uint8( ( src[x] * weight->i_scale ) + weight->i_offset )
+static inline void mc_weight( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
+{
+ int x, y;
+ if( weight->i_denom >= 1 )
+ {
+ for( y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
+ {
+ for( x = 0; x < i_width; x++ )
+ opscale( x );
+ }
+ }
+ else
+ {
+ for( y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
+ for( x = 0; x < i_width; x++ )
+ opscale_noden( x );
+ }
+}
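+/* Worked example (values illustrative): with i_scale=63, i_denom=5, i_offset=3,
+ * a source pixel of 100 becomes clip(((100*63 + 16) >> 5) + 3) = 200, i.e. a
+ * ~2x brightening plus a small offset -- the kind of weight a fade-in produces. */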
+
+#define MC_WEIGHT_C( name, lx ) \
+ static void name( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int height ) \
+{ \
+ int x, y; \
+ if( weight->i_denom >= 1 ) \
+ { \
+ for( y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
+ for( x = 0; x < lx; x++ ) \
+ opscale( x ); \
+ } \
+ else \
+ { \
+ for( y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
+ for( x = 0; x < lx; x++ ) \
+ opscale_noden( x ); \
+ } \
+}
+
+MC_WEIGHT_C( mc_weight_w20, 20 )
+MC_WEIGHT_C( mc_weight_w16, 16 )
+MC_WEIGHT_C( mc_weight_w12, 12 )
+MC_WEIGHT_C( mc_weight_w8, 8 )
+MC_WEIGHT_C( mc_weight_w4, 4 )
+MC_WEIGHT_C( mc_weight_w2, 2 )
+
+static weight_fn_t x264_mc_weight_wtab[6] =
+{
+ mc_weight_w2,
+ mc_weight_w4,
+ mc_weight_w8,
+ mc_weight_w12,
+ mc_weight_w16,
+ mc_weight_w20,
+};
+const x264_weight_t weight_none[3] = { {{0}} };
static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
{
int y;
static void mc_luma( uint8_t *dst, int i_dst_stride,
uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
- int i_width, int i_height )
+ int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
pixel_avg( dst, i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_width, i_height );
+ if( weight->weightfn )
+ mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
}
+ else if( weight->weightfn )
+ mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
else
- {
mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
- }
}
static uint8_t *get_ref( uint8_t *dst, int *i_dst_stride,
uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
- int i_width, int i_height )
+ int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_width, i_height );
+ if( weight->weightfn )
+ mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
+ return dst;
+ }
+ else if( weight->weightfn )
+ {
+ mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
return dst;
}
else
pf->avg[PIXEL_2x4] = pixel_avg_2x4;
pf->avg[PIXEL_2x2] = pixel_avg_2x2;
+ pf->weight = x264_mc_weight_wtab;
+ pf->offsetadd = x264_mc_weight_wtab;
+ pf->offsetsub = x264_mc_weight_wtab;
+ pf->weight_cache = x264_weight_cache;
+
pf->copy_16x16_unaligned = mc_copy_w16;
pf->copy[PIXEL_16x16] = mc_copy_w16;
pf->copy[PIXEL_8x8] = mc_copy_w8;
#ifndef X264_MC_H
#define X264_MC_H
+struct x264_weight_t;
+typedef void (* weight_fn_t)( uint8_t *, int, uint8_t *,int, const struct x264_weight_t *, int );
+typedef struct x264_weight_t
+{
+ /* aligning the first member is a gcc hack to force the struct to be
+ * 16-byte aligned and to force sizeof(struct) to be a multiple of 16 */
+ ALIGNED_16( int16_t cachea[8] );
+ int16_t cacheb[8];
+ int32_t i_denom;
+ int32_t i_scale;
+ int32_t i_offset;
+ weight_fn_t *weightfn;
+} ALIGNED_16( x264_weight_t );
+
+extern const x264_weight_t weight_none[3];
+
+#define SET_WEIGHT( w, b, s, d, o )\
+{\
+ (w).i_scale = (s);\
+ (w).i_denom = (d);\
+ (w).i_offset = (o);\
+ if( b )\
+ h->mc.weight_cache( h, &w );\
+ else\
+ w.weightfn = NULL;\
+}
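+/* Example, mirroring the blind-mode duplicate set up in encoder.c:
+ *   SET_WEIGHT( w[0], 1, 1, 0, -1 ); // enabled, scale=1, denom=0, offset=-1
+ * When scale == 1<<denom, the asm weight_cache implementations swap in the
+ * cheaper offsetadd/offsetsub kernels instead of the full multiply. */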
+
/* Do the MC
* XXX: Only width = 4, 8 or 16 are valid
* width == 4 -> height == 4 or 8
{
void (*mc_luma)(uint8_t *dst, int i_dst, uint8_t **src, int i_src,
int mvx, int mvy,
- int i_width, int i_height );
+ int i_width, int i_height, const x264_weight_t *weight );
/* may round up the dimensions if they're not a power of 2 */
uint8_t* (*get_ref)(uint8_t *dst, int *i_dst, uint8_t **src, int i_src,
int mvx, int mvy,
- int i_width, int i_height );
+ int i_width, int i_height, const x264_weight_t *weight );
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
* so it must be run from left to right. */
void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int src_stride, int dst_stride, int width, int height );
+ weight_fn_t *weight;
+ weight_fn_t *offsetadd;
+ weight_fn_t *offsetsub;
+ void (*weight_cache)( x264_t *, x264_weight_t * );
void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, int len );
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;* Laurent Aimar <fenrir@via.ecp.fr>
+;* Dylan Yudaken <dyudaken@gmail.com>
;* Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
SECTION_RODATA 32
ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
+pw_1: times 8 dw 1
pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_32: times 8 dw 32
SECTION .text
;=============================================================================
-; weighted prediction
+; implicit weighted biprediction
;=============================================================================
-; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,10,11
%endmacro
%endif
-%macro SPLATW 2
+%macro SPLATW 2-3 0
%if mmsize==16
- pshuflw %1, %2, 0
+ pshuflw %1, %2, %3*0x55
punpcklqdq %1, %1
%else
- pshufw %1, %2, 0
+ pshufw %1, %2, %3*0x55
%endif
%endmacro
AVG_WEIGHT ssse3, 8, 7
AVG_WEIGHT ssse3, 16, 7
+;=============================================================================
+; P frame explicit weighted prediction
+;=============================================================================
+
+%macro WEIGHT_START 1
+ mova m3, [r4]
+ mova m6, [r4+16]
+ movd m5, [r4+32]
+ pxor m2, m2
+%if (%1 == 20 || %1 == 12) && mmsize == 16
+ movdq2q mm3, xmm3
+ movdq2q mm4, xmm4
+ movdq2q mm5, xmm5
+ movdq2q mm6, xmm6
+ pxor mm2, mm2
+%endif
+%endmacro
+
+%macro WEIGHT_START_SSSE3 1
+ mova m3, [r4]
+ mova m4, [r4+16]
+ pxor m2, m2
+%if ( %1 == 20 || %1 == 12 )
+ movdq2q mm3, xmm3
+ movdq2q mm4, xmm4
+ pxor mm2, mm2
+%endif
+%endmacro
+
+;; macro to weight mmsize bytes taking half from %1 and half from %2
+%macro WEIGHT 2 ; (src1,src2)
+ movh m0, [%1]
+ movh m1, [%2]
+ punpcklbw m0, m2 ;setup
+ punpcklbw m1, m2 ;setup
+ pmullw m0, m3 ;scale
+ pmullw m1, m3 ;scale
+ paddsw m0, m6 ;1<<(denom-1)+(offset<<denom)
+ paddsw m1, m6 ;1<<(denom-1)+(offset<<denom)
+ psraw m0, m5 ;denom
+ psraw m1, m5 ;denom
+%endmacro
+
+%macro WEIGHT_SSSE3 2
+ movh m0, [%1]
+ movh m1, [%2]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+ psllw m0, 7
+ psllw m1, 7
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m0, m4
+ paddw m1, m4
+%endmacro
+
+%macro WEIGHT_SAVE_ROW 3 ;(src,dst,width)
+%if %3 == 16
+ mova [%2], %1
+%elif %3 == 8
+ movq [%2], %1
+%else
+ movd [%2], %1 ; width 2 can write garbage for last 2 bytes
+%endif
+%endmacro
+
+%macro WEIGHT_ROW 3 ; (src,dst,width)
+ ;; load pixels and apply the weight
+ WEIGHT %1, (%1+(mmsize/2))
+ packuswb m0, m1 ;put bytes into m0
+ WEIGHT_SAVE_ROW m0, %2, %3
+%endmacro
+
+%macro WEIGHT_SAVE_COL 2 ;(dst,size)
+%if %2 == 8
+ packuswb m0, m1
+ movq [%1], m0
+ movhps [%1+r1], m0
+%else
+ packuswb m0, m0
+ packuswb m1, m1
+ movd [%1], m0 ; width 2 can write garbage for last 2 bytes
+ movd [%1+r1], m1
+%endif
+%endmacro
+
+%macro WEIGHT_COL 3 ; (src,dst,width)
+%if %3 <= 4 && mmsize == 16
+ INIT_MMX
+ ;; load pixels and apply the weight
+ WEIGHT %1, (%1+r3)
+ WEIGHT_SAVE_COL %2, %3
+ INIT_XMM
+%else
+ WEIGHT %1, (%1+r3)
+ WEIGHT_SAVE_COL %2, %3
+%endif
+
+%endmacro
+
+%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
+%assign x 0
+%rep %3
+%if (%3-x) >= mmsize
+ WEIGHT_ROW (%1+x), (%2+x), mmsize ; weight 1 mmsize
+ WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight 1 mmsize
+ %assign x (x+mmsize)
+%else
+ WEIGHT_COL (%1+x),(%2+x),(%3-x)
+ %exitrep
+%endif
+%if x >= %3
+ %exitrep
+%endif
+%endrep
+%endmacro
+
+
+;void x264_mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int h )
+
+%ifdef ARCH_X86_64
+%define NUMREGS 6
+%define LOAD_HEIGHT
+%define HEIGHT_REG r5d
+%else
+%define NUMREGS 5
+%define LOAD_HEIGHT mov r4d, r5m
+%define HEIGHT_REG r4d
+%endif
+
+%macro WEIGHTER 2
+ cglobal x264_mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
+ WEIGHT_START %1
+ LOAD_HEIGHT
+.loop:
+ WEIGHT_TWO_ROW r2, r0, %1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub HEIGHT_REG, 2
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+WEIGHTER 4, mmxext
+WEIGHTER 8, mmxext
+WEIGHTER 12, mmxext
+WEIGHTER 16, mmxext
+WEIGHTER 20, mmxext
+INIT_XMM
+WEIGHTER 8, sse2
+WEIGHTER 16, sse2
+WEIGHTER 20, sse2
+%define WEIGHT WEIGHT_SSSE3
+%define WEIGHT_START WEIGHT_START_SSSE3
+INIT_MMX
+WEIGHTER 4, ssse3
+INIT_XMM
+WEIGHTER 8, ssse3
+WEIGHTER 16, ssse3
+WEIGHTER 20, ssse3
+
+%macro OFFSET_OP 7
+ mov%6 m0, [%1]
+ mov%6 m1, [%2]
+ p%5usb m0, m2
+ p%5usb m1, m2
+ mov%7 [%3], m0
+ mov%7 [%4], m1
+%endmacro
+
+%macro OFFSET_TWO_ROW 4
+%assign x 0
+%rep %3
+%if (%3-x) >= mmsize
+ OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
+ %assign x (x+mmsize)
+%else
+ OFFSET_OP (%1+x),(%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
+ %exitrep
+%endif
+%if x >= %3
+ %exitrep
+%endif
+%endrep
+%endmacro
+
+;void x264_mc_offset_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *w, int h )
+%macro OFFSET 3
+ cglobal x264_mc_offset%3_w%1_%2, NUMREGS, NUMREGS
+ mova m2, [r4]
+ LOAD_HEIGHT
+.loop:
+ OFFSET_TWO_ROW r2, r0, %1, %3
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub HEIGHT_REG, 2
+ jg .loop
+ REP_RET
+%endmacro
+
+%macro OFFSETPN 2
+ OFFSET %1, %2, add
+ OFFSET %1, %2, sub
+%endmacro
+INIT_MMX
+OFFSETPN 4, mmxext
+OFFSETPN 8, mmxext
+OFFSETPN 12, mmxext
+OFFSETPN 16, mmxext
+OFFSETPN 20, mmxext
+INIT_XMM
+OFFSETPN 12, sse2
+OFFSETPN 16, sse2
+OFFSETPN 20, sse2
+%undef LOAD_HEIGHT
+%undef HEIGHT_REG
+%undef NUMREGS
+
;=============================================================================
DECL_SUF( x264_pixel_avg_4x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_4x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+
+#define MC_WEIGHT(w,type) \
+ extern void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
+
+#define MC_WEIGHT_OFFSET(w,type) \
+ extern void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+ extern void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+ MC_WEIGHT(w,type)
+
+MC_WEIGHT_OFFSET( 4, mmxext )
+MC_WEIGHT_OFFSET( 8, mmxext )
+MC_WEIGHT_OFFSET( 12, mmxext )
+MC_WEIGHT_OFFSET( 16, mmxext )
+MC_WEIGHT_OFFSET( 20, mmxext )
+MC_WEIGHT_OFFSET( 12, sse2 )
+MC_WEIGHT_OFFSET( 16, sse2 )
+MC_WEIGHT_OFFSET( 20, sse2 )
+MC_WEIGHT( 8, sse2 )
+MC_WEIGHT( 4, ssse3 )
+MC_WEIGHT( 8, ssse3 )
+MC_WEIGHT( 12, ssse3 )
+MC_WEIGHT( 16, ssse3 )
+MC_WEIGHT( 20, ssse3 )
+#undef MC_WEIGHT_OFFSET
+#undef MC_WEIGHT
+
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
+#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
+ static void (* x264_mc_##function##_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
+{\
+ x264_mc_##function##_w4_##name1,\
+ x264_mc_##function##_w4_##name1,\
+ x264_mc_##function##_w8_##name2,\
+ x264_mc_##function##_w##w12version##_##instr,\
+ x264_mc_##function##_w16_##instr,\
+ x264_mc_##function##_w20_##instr,\
+};
+
+MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(offsetsub,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,16)
+MC_WEIGHT_WTAB(offsetadd,sse2,mmxext,mmxext,16)
+MC_WEIGHT_WTAB(offsetsub,sse2,mmxext,mmxext,16)
+MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)
+
+static void x264_weight_cache_mmxext( x264_t *h, x264_weight_t *w )
+{
+ int i;
+ int16_t den1;
+
+ if( w->i_scale == 1<<w->i_denom )
+ {
+ if( w->i_offset < 0 )
+ w->weightfn = h->mc.offsetsub;
+ else
+ w->weightfn = h->mc.offsetadd;
+ memset( w->cachea, abs(w->i_offset), sizeof(w->cachea) );
+ return;
+ }
+ w->weightfn = h->mc.weight;
+ den1 = ( 1 << ( w->i_denom - 1 ) ) | ( w->i_offset << w->i_denom );
+ for( i = 0; i < 8; i++ )
+ {
+ w->cachea[i] = w->i_scale;
+ w->cacheb[i] = den1;
+ }
+}
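+/* den1 packs rounding and offset into one constant: (1<<(denom-1)) | (offset<<denom)
+ * equals (1<<(denom-1)) + (offset<<denom) since the terms occupy disjoint bits, so
+ * the asm's (src*scale + den1) >> denom matches the C opscale's
+ * ((src*scale + (1<<(denom-1))) >> denom) + offset (ignoring saturation). */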
+
+static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
+{
+ int i, den1;
+ if( w->i_scale == 1<<w->i_denom )
+ {
+ if( w->i_offset < 0 )
+ w->weightfn = h->mc.offsetsub;
+ else
+ w->weightfn = h->mc.offsetadd;
+
+ memset( w->cachea, abs( w->i_offset ), sizeof(w->cachea) );
+ return;
+ }
+ w->weightfn = h->mc.weight;
+ den1 = w->i_scale << ( 8 - w->i_denom );
+ for( i = 0; i < 8; i++ )
+ {
+ w->cachea[i] = den1;
+ w->cacheb[i] = w->i_offset;
+ }
+}
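+/* ssse3 path, for reference: pixels are pre-shifted left by 7 and
+ * den1 = scale << (8-denom), so pmulhrsw computes
+ * ((src<<7) * (scale<<(8-denom)) + (1<<14)) >> 15 == (src*scale + (1<<(denom-1))) >> denom,
+ * the same rounded scaling as the C code, with the offset added afterwards. */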
+
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
static void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
uint8_t *src[4], int i_src_stride,\
int mvx, int mvy,\
- int i_width, int i_height )\
+ int i_width, int i_height, const x264_weight_t *weight )\
{\
int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
x264_pixel_avg_wtab_##instr1[i_width>>2](\
dst, i_dst_stride, src1, i_src_stride,\
src2, i_height );\
+ if( weight->weightfn )\
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );\
}\
+ else if( weight->weightfn )\
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );\
else\
- {\
- x264_mc_copy_wtab_##instr2[i_width>>2](\
- dst, i_dst_stride, src1, i_src_stride, i_height );\
- }\
+ x264_mc_copy_wtab_##instr2[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );\
}
MC_LUMA(mmxext,mmxext,mmx)
static uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
uint8_t *src[4], int i_src_stride,\
int mvx, int mvy,\
- int i_width, int i_height )\
+ int i_width, int i_height, const x264_weight_t *weight )\
{\
int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
x264_pixel_avg_wtab_##name[i_width>>2](\
dst, *i_dst_stride, src1, i_src_stride,\
src2, i_height );\
+ if( weight->weightfn ) \
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height ); \
+ return dst;\
+ }\
+ else if( weight->weightfn ) \
+ {\
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );\
return dst;\
}\
else\
pf->get_ref = get_ref_mmxext;
pf->mc_chroma = x264_mc_chroma_mmxext;
+ pf->weight = x264_mc_weight_wtab_mmxext;
+ pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
+ pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
+ pf->weight_cache = x264_weight_cache_mmxext;
+
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmxext;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
+ pf->weight = x264_mc_weight_wtab_sse2;
+ pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+ pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
pf->mc_luma = mc_luma_cache64_ssse3;
pf->get_ref = get_ref_cache64_ssse3;
+
+ /* ssse3 weight is slower on Nehalem, so only assign here. */
+ pf->weight_cache = x264_weight_cache_ssse3;
+ pf->weight = x264_mc_weight_wtab_ssse3;
}
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
int i_ref = i ? h->i_ref1 : h->i_ref0;
for( j=0; j<i_ref; j++ )
{
- x264_frame_cond_wait( fref[j], thresh );
+ x264_frame_cond_wait( fref[j]->orig, thresh );
+ fref[j]->i_lines_completed = fref[j]->orig->i_lines_completed;
thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
}
}
+
if( h->param.b_deterministic )
thread_mvy_range = h->param.analyse.i_mv_range_thread;
if( h->mb.b_interlaced )
thread_mvy_range >>= 1;
+
+ for( j=0; j<h->i_ref0; j++ )
+ {
+ if( h->sh.weight[j][0].weightfn )
+ {
+ x264_frame_t *frame = h->fref0[j];
+ int width = frame->i_width[0] + 2*PADH;
+ int i_padv = PADV << h->param.b_interlaced;
+ int offset, height;
+ uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
+ int k;
+ height = X264_MIN( 16 + thread_mvy_range + pix_y + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
+ offset = h->fenc->i_lines_weighted*frame->i_stride[0];
+ h->fenc->i_lines_weighted += height;
+ if( height )
+ {
+ for( k = j; k < h->i_ref0; k++ )
+ if( h->sh.weight[k][0].weightfn )
+ {
+ uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
+ x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
+ src + offset, frame->i_stride[0],
+ width, height, &h->sh.weight[k][0] );
+ }
+ }
+ break;
+ }
+ }
}
h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
}
#define LOAD_FENC( m, src, xoff, yoff) \
+ (m)->p_cost_mv = a->p_cost_mv; \
(m)->i_stride[0] = h->mb.pic.i_stride[0]; \
(m)->i_stride[1] = h->mb.pic.i_stride[1]; \
(m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
(m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
- (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
+ (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
(m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
- (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];
+ (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
+ (m)->weight = weight_none; \
+ (m)->i_ref = ref;
+
+#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
+ (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
+ (m)->weight = h->sh.weight[ref];
#define REF_COST(list, ref) \
(a->p_cost_ref##list[ref])
/* 16x16 Search on all ref frame */
m.i_pixel = PIXEL_16x16;
- m.p_cost_mv = a->p_cost_mv;
LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
a->l0.me16x16.cost = INT_MAX;
const int i_ref_cost = REF_COST( 0, i_ref );
i_halfpel_thresh -= i_ref_cost;
m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
/* search with ref */
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
+
x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
const int y8 = i/2;
m.i_pixel = PIXEL_8x8;
- m.p_cost_mv = a->p_cost_mv;
LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
l0m->cost = INT_MAX;
const int i_ref_cost = REF_COST( 0, i_ref );
i_halfpel_thresh -= i_ref_cost;
m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
+
x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
{
const int i_ref = a->l0.me16x16.i_ref;
const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
- uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref];
uint8_t **p_fenc = h->mb.pic.p_fenc;
int i_mvc;
int16_t (*mvc)[2] = a->l0.mvc[i_ref];
const int y8 = i/2;
m->i_pixel = PIXEL_8x8;
- m->p_cost_mv = a->p_cost_mv;
m->i_ref_cost = i_ref_cost;
- m->i_ref = i_ref;
LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
- LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
+ LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
+ LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
+
x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
x264_me_search( h, m, mvc, i_mvc );
const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
m.i_pixel = PIXEL_16x8;
- m.p_cost_mv = a->p_cost_mv;
LOAD_FENC( &m, p_fenc, 0, 8*i );
l0m->cost = INT_MAX;
const int i_ref = ref8[j];
const int i_ref_cost = REF_COST( 0, i_ref );
m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
/* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
*(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
*(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
+
x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
x264_me_search( h, &m, mvc, 3 );
const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
m.i_pixel = PIXEL_8x16;
- m.p_cost_mv = a->p_cost_mv;
LOAD_FENC( &m, p_fenc, 8*i, 0 );
l0m->cost = INT_MAX;
const int i_ref = ref8[j];
const int i_ref_cost = REF_COST( 0, i_ref );
m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
*(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
*(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
*(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
+ LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
+
x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
x264_me_search( h, &m, mvc, 3 );
const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
const int i_ref = a->l0.me8x8[i8x8].i_ref;
const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ x264_weight_t *weight = h->sh.weight[i_ref];
#define CHROMA4x4MC( width, height, me, x, y ) \
h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
- h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height );
+ if( weight[1].weightfn ) \
+ weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
+ h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+ if( weight[2].weightfn ) \
+ weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
+
if( pixel == PIXEL_4x4 )
{
x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
m->i_pixel = PIXEL_4x4;
- m->p_cost_mv = a->p_cost_mv;
LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
+ LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
m->i_pixel = PIXEL_8x4;
- m->p_cost_mv = a->p_cost_mv;
LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
+ LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
m->i_pixel = PIXEL_4x8;
- m->p_cost_mv = a->p_cost_mv;
LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
+ LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
/* 16x16 Search on all ref frame */
m.i_pixel = PIXEL_16x16;
- m.p_cost_mv = a->p_cost_mv;
+ m.weight = weight_none;
+
LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
/* ME for List 0 */
/* save mv for predicting neighbors */
*(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
}
+ a->l0.me16x16.i_ref = a->l0.i_ref;
+
/* subtract ref cost, so we don't have to add it for the other MB types */
a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
/* save mv for predicting neighbors */
*(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
}
+ a->l1.me16x16.i_ref = a->l1.i_ref;
+
/* subtract ref cost, so we don't have to add it for the other MB types */
a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
/* get cost of BI mode */
src0 = h->mc.get_ref( pix0, &stride0,
- h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
+ h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
+ a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
src1 = h->mc.get_ref( pix1, &stride1,
- h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
+ h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
+ a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
x264_me_t *m = &lX->me8x8[i];
m->i_pixel = PIXEL_8x8;
- m->p_cost_mv = a->p_cost_mv;
LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
/* BI mode */
src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 8, 8 );
+ m->mv[0], m->mv[1], 8, 8, weight_none );
i_part_cost_bi += m->cost_mv;
/* FIXME: ref cost */
}
x264_me_t *m = &lX->me16x8[i];
m->i_pixel = PIXEL_16x8;
- m->p_cost_mv = a->p_cost_mv;
LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
/* BI mode */
src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 16, 8 );
+ m->mv[0], m->mv[1], 16, 8, weight_none );
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
x264_me_t *m = &lX->me8x16[i];
m->i_pixel = PIXEL_8x16;
- m->p_cost_mv = a->p_cost_mv;
LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
/* BI mode */
src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 8, 16 );
+ m->mv[0], m->mv[1], 8, 16, weight_none );
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
void x264_slicetype_analyse( x264_t *h, int keyframe );
+int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lowres, int b_lookahead );
+
int x264_lookahead_init( x264_t *h, int i_slicetype_length );
int x264_lookahead_is_empty( x264_t *h );
void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame );
}
}
- if( ( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) ) ||
- ( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B ) )
+ if( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) )
{
- /* FIXME */
+ /* pred_weight_table() */
+ bs_write_ue( s, sh->weight[0][0].i_denom );
+ bs_write_ue( s, sh->weight[0][1].i_denom );
+ for( i = 0; i < sh->i_num_ref_idx_l0_active; i++ )
+ {
+ int luma_weight_l0_flag = !!sh->weight[i][0].weightfn;
+ int chroma_weight_l0_flag = !!sh->weight[i][1].weightfn || !!sh->weight[i][2].weightfn;
+ bs_write1( s, luma_weight_l0_flag );
+ if( luma_weight_l0_flag )
+ {
+ bs_write_se( s, sh->weight[i][0].i_scale );
+ bs_write_se( s, sh->weight[i][0].i_offset );
+ }
+ bs_write1( s, chroma_weight_l0_flag );
+ if( chroma_weight_l0_flag )
+ {
+ int j;
+ for( j = 1; j < 3; j++ )
+ {
+ bs_write_se( s, sh->weight[i][j].i_scale );
+ bs_write_se( s, sh->weight[i][j].i_offset );
+ }
+ }
+ }
+ }
+ else if( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B )
+ {
+ /* TODO */
}
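+ /* The fields written above are the H.264 pred_weight_table() syntax elements:
+ * luma_log2_weight_denom, chroma_log2_weight_denom, then per ref
+ * luma_weight_l0_flag, luma_weight_l0[i], luma_offset_l0[i] and the chroma
+ * equivalents, all coded as ue/se as in the spec. */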
if( i_nal_ref_idc != 0 )
x264_log( h, X264_LOG_WARNING, "interlace + direct=temporal is not implemented\n" );
h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
}
+ if( h->param.analyse.i_weighted_pred > 0 )
+ {
+ x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
+ h->param.analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+ }
}
/* Detect default ffmpeg settings and terminate with an error. */
h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> h->param.b_interlaced);
}
+ h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, 0, X264_WEIGHTP_SMART );
+ if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy && !h->param.b_interlaced )
+ h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
+
if( h->param.i_threads > 1 )
{
int r = h->param.analyse.i_mv_range_thread;
CHECKED_MALLOCZERO( h->frames.unused[1], (h->param.i_threads + 20) * sizeof(x264_frame_t *) );
CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe
+ h->param.i_threads + 3) * sizeof(x264_frame_t *) );
-
+ if( h->param.analyse.i_weighted_pred > 0 )
+ CHECKED_MALLOCZERO( h->frames.blank_unused, h->param.i_threads * 4 * sizeof(x264_frame_t *) );
h->i_ref0 = 0;
h->i_ref1 = 0;
}
}
+/* return -1 on failure, else return the index of the new reference frame */
+int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w )
+{
+ int i = h->i_ref0;
+ int j;
+ x264_frame_t *newframe;
+ if( i <= 1 ) /* zero or one refs: nowhere to insert a duplicate */
+ return -1;
+
+ /* Find a place to insert the duplicate in the reference list. */
+ for( j = 0; j < i; j++ )
+ if( h->fref0[i_ref]->i_frame != h->fref0[j]->i_frame )
+ {
+ /* found a place, after j, make sure there is not already a duplicate there */
+ if( j == i-1 || ( h->fref0[j+1] && h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
+ break;
+ }
+
+ if( j == i ) /* No room in the reference list for the duplicate. */
+ return -1;
+ j++;
+
+ newframe = x264_frame_pop_blank_unused( h );
+
+ //FIXME: probably don't need to copy everything
+ *newframe = *h->fref0[i_ref];
+ newframe->i_reference_count = 1;
+ newframe->orig = h->fref0[i_ref];
+ newframe->b_duplicate = 1;
+ memcpy( h->fenc->weight[j], w, sizeof(h->fenc->weight[j]) );
+
+ /* shift the frames to make space for the dupe. */
+ h->b_ref_reorder[0] = 1;
+ if( h->i_ref0 < 16 )
+ ++h->i_ref0;
+ h->fref0[15] = NULL;
+ x264_frame_unshift( &h->fref0[j], newframe );
+
+ return j;
+}
+
+static void x264_weighted_pred_init( x264_t *h )
+{
+ int i_ref;
+ int i;
+
+ /* no analysis yet: default to the unweighted reference planes and clear all weights */
+ for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
+ h->fenc->weighted[i_ref] = h->fref0[i_ref]->filtered[0];
+
+ // FIXME: This only supports weighting of one reference frame
+ // and duplicates of that frame.
+ h->fenc->i_lines_weighted = 0;
+
+ for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
+ for( i = 0; i < 3; i++ )
+ h->sh.weight[i_ref][i].weightfn = NULL;
+
+ if( h->sh.i_type != SLICE_TYPE_P || h->param.analyse.i_weighted_pred <= 0 )
+ return;
+
+ int i_padv = PADV << h->param.b_interlaced;
+ int denom = -1;
+ int weightluma = 0;
+ int buffer_next = 0;
+ int j;
+ //FIXME: when chroma support is added, move this into loop
+ h->sh.weight[0][1].weightfn = h->sh.weight[0][2].weightfn = NULL;
+ h->sh.weight[0][1].i_denom = h->sh.weight[0][2].i_denom = 0;
+ for( j = 0; j < h->i_ref0; j++ )
+ {
+ if( h->fenc->weight[j][0].weightfn )
+ {
+ h->sh.weight[j][0] = h->fenc->weight[j][0];
+ // if weight is useless, don't write it to stream
+ if( h->sh.weight[j][0].i_scale == 1<<h->sh.weight[j][0].i_denom && h->sh.weight[j][0].i_offset == 0 )
+ h->sh.weight[j][0].weightfn = NULL;
+ else
+ {
+ if( !weightluma )
+ {
+ weightluma = 1;
+ h->sh.weight[0][0].i_denom = denom = h->sh.weight[j][0].i_denom;
+ }
+ assert( h->sh.weight[j][0].i_denom == denom );
+ h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] +
+ h->fenc->i_stride[0] * i_padv + PADH;
+ }
+ }
+
+ // scale the full-resolution frame now; with threads > 1, rows are weighted lazily during encode
+ if( h->sh.weight[j][0].weightfn && h->param.i_threads == 1 )
+ {
+ uint8_t *src = h->fref0[j]->filtered[0] - h->fref0[j]->i_stride[0]*i_padv - PADH;
+ uint8_t *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
+ int stride = h->fenc->i_stride[0];
+ int width = h->fenc->i_width[0] + PADH*2;
+ int height = h->fenc->i_lines[0] + i_padv*2;
+ x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
+ h->fenc->i_lines_weighted = height;
+ }
+ }
+ if( !weightluma )
+ h->sh.weight[0][0].i_denom = 0;
+}
+
static inline void x264_reference_build_list( x264_t *h, int i_poc )
{
int i;
h->i_ref1 = X264_MIN( h->i_ref1, h->frames.i_max_ref1 );
h->i_ref0 = X264_MIN( h->i_ref0, h->frames.i_max_ref0 );
h->i_ref0 = X264_MIN( h->i_ref0, h->param.i_frame_reference ); // if reconfig() has lowered the limit
+
+ /* add duplicates */
+ if( h->fenc->i_type == X264_TYPE_P )
+ {
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ {
+ x264_weight_t w[3];
+ w[1].weightfn = w[2].weightfn = NULL;
+ if( h->param.rc.b_stat_read )
+ x264_ratecontrol_set_weights( h, h->fenc );
+ else if( h->param.i_threads == 1 )
+ x264_weights_analyse( h, h->fenc, h->fref0[0], 0, 0 );
+
+ if( !h->fenc->weight[0][0].weightfn )
+ {
+ h->fenc->weight[0][0].i_denom = 0;
+ SET_WEIGHT( w[0], 1, 1, 0, -1 );
+ x264_weighted_reference_duplicate( h, 0, w );
+ }
+ else
+ {
+ if( h->fenc->weight[0][0].i_scale == 1<<h->fenc->weight[0][0].i_denom )
+ {
+ SET_WEIGHT( h->fenc->weight[0][0], 1, 1, 0, h->fenc->weight[0][0].i_offset );
+ }
+ x264_weighted_reference_duplicate( h, 0, weight_none );
+ w[0] = h->fenc->weight[0][0];
+ w[0].i_offset--;
+ h->mc.weight_cache( h, &w[0] );
+ x264_weighted_reference_duplicate( h, 0, w );
+ }
+ }
+ else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
+ {
+ // blind weighting: scale=1, offset=-1
+ x264_weight_t w[3];
+ SET_WEIGHT( w[0], 1, 1, 0, -1 );
+ h->fenc->weight[0][0].i_denom = 0;
+ w[1].weightfn = w[2].weightfn = NULL;
+ x264_weighted_reference_duplicate( h, 0, w );
+ }
+ }
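+ /* Net effect in SMART mode (illustrative): when analysis finds a weight, list0
+ * carries the weighted ref, an unweighted duplicate, and a duplicate with the
+ * offset lowered by 1; otherwise just a single blind offset -1 duplicate. */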
+
assert( h->i_ref0 + h->i_ref1 <= 16 );
h->mb.pic.i_fref[0] = h->i_ref0;
h->mb.pic.i_fref[1] = h->i_ref1;
if( h->sh.i_type == SLICE_TYPE_B )
x264_macroblock_bipred_init( h );
+ /*------------------------- Weights -------------------------------------*/
+ x264_weighted_pred_init( h );
+
/* ------------------------ Create slice header ----------------------- */
x264_slice_init( h, i_nal_type, i_global_qp );
for( i = 0; i < 32; i++ )
h->stat.i_mb_count_ref[h->sh.i_type][i_list][i] += h->stat.frame.i_mb_count_ref[i_list][i];
if( h->sh.i_type == SLICE_TYPE_P )
+ {
h->stat.i_consecutive_bframes[h->fdec->i_frame - h->fref0[0]->i_frame - 1]++;
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ {
+ for( i = 0; i < 3; i++ )
+ for( j = 0; j < h->i_ref0; j++ )
+ if( h->sh.weight[j][i].i_denom != 0 )
+ {
+ h->stat.i_wpred[i]++;
+ break;
+ }
+ }
+ }
if( h->sh.i_type == SLICE_TYPE_B )
{
h->stat.i_direct_frames[ h->sh.b_direct_spatial_mv_pred ] ++;
}
#endif
+ /* Remove duplicate refs. This must be done near the end, as it breaks the
+ * h->fref0 array by clearing some of its pointers. */
+ for( i = 0; i < h->i_ref0; i++ )
+ if( h->fref0[i] && h->fref0[i]->b_duplicate )
+ {
+ x264_frame_push_blank_unused( h, h->fref0[i] );
+ h->fref0[i] = 0;
+ }
+
if( h->param.psz_dump_yuv )
x264_frame_dump( h );
fixed_pred_modes[i][8] * 100.0 / sum_pred_modes[i] );
}
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%%\n",
+ h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );
+
for( i_list = 0; i_list < 2; i_list++ )
{
int i_slice;
x264_frame_delete_list( h->frames.unused[0] );
x264_frame_delete_list( h->frames.unused[1] );
x264_frame_delete_list( h->frames.current );
+ x264_frame_delete_list( h->frames.blank_unused );
h = h->thread[0];
{
h->mc.mc_luma( h->mb.pic.p_fdec[0], FDEC_STRIDE,
h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
- mvx, mvy, 16, 16 );
+ mvx, mvy, 16, 16, &h->sh.weight[0][0] );
h->mc.mc_chroma( h->mb.pic.p_fdec[1], FDEC_STRIDE,
h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
mvx, mvy, 8, 8 );
+ if( h->sh.weight[0][1].weightfn )
+ h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
+ h->mb.pic.p_fdec[1], FDEC_STRIDE,
+ &h->sh.weight[0][1], 8 );
+
h->mc.mc_chroma( h->mb.pic.p_fdec[2], FDEC_STRIDE,
h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
mvx, mvy, 8, 8 );
+
+ if( h->sh.weight[0][2].weightfn )
+ h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
+ h->mb.pic.p_fdec[2], FDEC_STRIDE,
+ &h->sh.weight[0][2], 8 );
}
x264_macroblock_encode_skip( h );
/* Motion compensation */
h->mc.mc_luma( h->mb.pic.p_fdec[0], FDEC_STRIDE,
h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
- mvp[0], mvp[1], 16, 16 );
+ mvp[0], mvp[1], 16, 16, &h->sh.weight[0][0] );
}
for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
mvp[0], mvp[1], 8, 8 );
+
+ if( h->sh.weight[0][1+ch].weightfn )
+ h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+ h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+ &h->sh.weight[0][1+ch], 8 );
}
/* there is almost never a termination during chroma, but we can't avoid the check entirely */
const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][1], h->mb.mv_min[1], h->mb.mv_max[1] );
int nz;
- h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 );
+ h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
+ mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4, &h->sh.weight[i_ref][0] );
if( h->mb.b_lossless )
{
#define COST_MV( mx, my )\
{\
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\
- &p_fref[(my)*stride+(mx)], stride )\
+ &p_fref_w[(my)*stride+(mx)], stride )\
+ BITS_MVD(mx,my);\
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
}
#define COST_MV_HPEL( mx, my ) \
{ \
int stride2 = 16; \
- uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
- uint8_t *pix_base = p_fref + bmx + bmy*stride;\
+ uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\
h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
#define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\
{\
- uint8_t *pix_base = p_fref + bmx + bmy*stride;\
+ uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
- uint8_t *pix_base = p_fref + omx + omy*stride;\
+ uint8_t *pix_base = p_fref_w + omx + omy*stride;\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
{\
h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
- p_fref + (m0x) + (m0y)*stride,\
- p_fref + (m1x) + (m1y)*stride,\
- p_fref + (m2x) + (m2y)*stride,\
+ p_fref_w + (m0x) + (m0y)*stride,\
+ p_fref_w + (m1x) + (m1y)*stride,\
+ p_fref_w + (m2x) + (m2y)*stride,\
stride, costs );\
costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\
costs[1] += p_cost_mvx[(m1x)<<2];\
int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
int omx, omy, pmx, pmy;
uint8_t *p_fenc = m->p_fenc[0];
- uint8_t *p_fref = m->p_fref[0];
+ uint8_t *p_fref_w = m->p_fref_w;
ALIGNED_ARRAY_16( uint8_t, pix,[16*16] );
int i, j;
else
{
int dir = 0;
- uint8_t *pix_base = p_fref + omx + (omy-4*i)*stride;
+ uint8_t *pix_base = p_fref_w + omx + (omy-4*i)*stride;
int dy = i*stride;
#define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
int nmvsad = 0, limit;
int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
- int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
+ int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride )
+ BITS_MVD( bmx, bmy );
for( my = min_y; my <= max_y; my++ )
{
cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
for( i=0; i<xn-2; i+=3 )
{
- uint8_t *ref = p_fref+min_x+my*stride;
+ uint8_t *ref = p_fref_w+min_x+my*stride;
int sads[3];
h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
for( j=0; j<3; j++ )
for( ; i<xn; i++ )
{
int mx = min_x+xs[i];
- int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+mx+my*stride, stride )
+ int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+mx+my*stride, stride )
+ cost_fpel_mvx[xs[i]];
if( sad < bsad*sad_thresh>>3 )
{
if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P )
m->cost -= m->i_ref_cost;
refine_subpel( h, m, hpel, qpel, NULL, 1 );
}
#define COST_MV_SAD( mx, my ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
if( b_refine_qpel || (dir^1) != odir ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
if( b_chroma_me && cost < bcost ) \
{ \
h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \
+ if( m->weight[1].weightfn ) \
+ m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix[0], 8, pix[0], 8, \
+ &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix[0], 8 ); \
if( cost < bcost ) \
{ \
h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \
+ if( m->weight[2].weightfn ) \
+ m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix[0], 8, pix[0], 8, \
+ &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix[0], 8 ); \
} \
} \
if( cost < bcost ) \
int costs[4];
int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough
uint8_t *src0, *src1, *src2, *src3;
- src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1 );
- src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh );
+ src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
+ src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] );
src1 = src0 + stride;
src3 = src2 + 1;
h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
int mvx = om##list##x+dx;\
int mvy = om##list##y+dy;\
stride##list[i] = bw;\
- src##list[i] = h->mc.get_ref( pixy_buf[list][i], &stride##list[i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh ); \
+ src##list[i] = h->mc.get_ref( pixy_buf[list][i], &stride##list[i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \
if( rd )\
{\
h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
if( !avoid_mvp || !(mx == pmx && my == pmy) ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
+ uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4, &m->weight[0] ); \
dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[mx] + p_cost_mvy[my]; \
COPY1_IF_LT( bsatd, dst ); \
typedef struct
{
+ /* aligning the first member is a gcc hack to force the struct to be
+ * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
/* input */
- int i_pixel; /* PIXEL_WxH */
+ ALIGNED_16( int i_pixel ); /* PIXEL_WxH */
uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
int i_ref_cost;
int i_ref;
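+ /* weights to apply to the reference during ME (weight_none if unweighted) */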
+ const x264_weight_t *weight;
uint8_t *p_fref[6];
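+ /* fullpel luma plane with the weight pre-applied; equals p_fref[0] when
+ * no weight is in use */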
+ uint8_t *p_fref_w;
uint8_t *p_fenc[3];
uint16_t *integral;
int i_stride[2];
int s_count;
float blurred_complexity;
char direct_mode;
+ int8_t weight[2];
+ int8_t i_weight_denom;
int refcount[16];
int refs;
} ratecontrol_entry_t;
{
ratecontrol_entry_t *rce = h->rc->rce;
x264_frame_t *frames[16];
+ x264_weight_t weights[16][3];
+ int refcount[16];
int ref, i;
if( rce->refs != h->i_ref0 )
return -1;
memcpy( frames, h->fref0, sizeof(frames) );
+ memcpy( refcount, rce->refcount, sizeof(refcount) );
+ memcpy( weights, h->fenc->weight, sizeof(weights) );
+ memset( h->fenc->weight, 0, sizeof(h->fenc->weight) );
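+ /* The weights were saved above and cleared from the frame; they are copied
+ * back reference-by-reference as the list is reordered below. */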
/* For now don't reorder ref 0; it seems to lower quality
in most cases due to skips. */
{
int max = -1;
int bestref = 1;
+
for( i = 1; i < h->i_ref0; i++ )
- /* Favor lower POC as a tiebreaker. */
- COPY2_IF_GT( max, rce->refcount[i], bestref, i );
- rce->refcount[bestref] = -1;
+ if( !frames[i]->b_duplicate || frames[i]->i_frame != h->fref0[ref-1]->i_frame )
+ /* Favor lower POC as a tiebreaker. */
+ COPY2_IF_GT( max, refcount[i], bestref, i );
+
+ /* FIXME: If there are duplicates from frames other than ref0 then it is possible
+ * that the optimal ordering doesn't place every duplicate. */
+
+ refcount[bestref] = -1;
h->fref0[ref] = frames[bestref];
+ memcpy( h->fenc->weight[ref], weights[bestref], sizeof(weights[bestref]) );
}
return 0;
return -1;
}
+ if( ( p = strstr( opts, "wpredp=" ) ) && sscanf( p, "wpredp=%d", &i ) &&
+ X264_MAX( 0, h->param.analyse.i_weighted_pred ) != i )
+ {
+ x264_log( h, X264_LOG_ERROR, "different weightp option than 1st pass (had weightp=%d)\n", i );
+ return -1;
+ }
+
/* since B-adapt doesn't (yet) take into account B-pyramid,
* the converse is not a problem */
if( h->param.i_bframe )
}
rce->refs = ref;
+ /* find weights */
+ rce->i_weight_denom = -1;
+ char *w = strchr( p, 'w' );
+ if( w )
+ if( sscanf( w, "w:%hhd,%hhd,%hhd", &rce->i_weight_denom, &rce->weight[0], &rce->weight[1] ) != 3 )
+ rce->i_weight_denom = -1;
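+ /* e.g. "w:6,96,-2" restores denom=6, scale=96, offset=-2, the order written
+ * out by x264_ratecontrol_end below */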
+
switch(pict_type)
{
case 'I': rce->kept_as_ref = 1;
}
}
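+/* Apply the weights parsed from the 1st-pass stats to the frame about to be
+ * encoded; a no-op unless weightp is enabled. */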
+void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm )
+{
+ ratecontrol_entry_t *rce = &h->rc->entry[frm->i_frame];
+ if( h->param.analyse.i_weighted_pred <= 0 )
+ return;
+ if( rce->i_weight_denom >= 0 )
+ SET_WEIGHT( frm->weight[0][0], 1, rce->weight[0], rce->i_weight_denom, rce->weight[1] );
+}
+
/* After encoding one frame, save stats and update ratecontrol state */
int x264_ratecontrol_end( x264_t *h, int bits )
{
c_direct) < 0 )
goto fail;
- for( i = 0; i < h->i_ref0; i++ )
+ /* Only write information for reference reordering once. */
+ int use_old_stats = h->param.rc.b_stat_read && rc->rce->refs > 1;
+ for( i = 0; i < (use_old_stats ? rc->rce->refs : h->i_ref0); i++ )
{
- int refcount = h->param.b_interlaced ? h->stat.frame.i_mb_count_ref[0][i*2]
- + h->stat.frame.i_mb_count_ref[0][i*2+1] :
- h->stat.frame.i_mb_count_ref[0][i];
+ int refcount = use_old_stats ? rc->rce->refcount[i]
+ : h->param.b_interlaced ? h->stat.frame.i_mb_count_ref[0][i*2]
+ + h->stat.frame.i_mb_count_ref[0][i*2+1]
+ : h->stat.frame.i_mb_count_ref[0][i];
if( fprintf( rc->p_stat_file_out, "%d ", refcount ) < 0 )
goto fail;
}
- if( fprintf( rc->p_stat_file_out, ";\n" ) < 0 )
+ if( h->sh.weight[0][0].weightfn )
+ {
+ if( fprintf( rc->p_stat_file_out, "w:%d,%d,%d", h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
+ goto fail;
+ }
+
+ if( fprintf( rc->p_stat_file_out, ";\n" ) < 0 )
goto fail;
/* Don't re-write the data in multi-pass mode. */
void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
int x264_ratecontrol_slice_type( x264_t *, int i_frame );
+void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm );
void x264_ratecontrol_mb( x264_t *, int bits );
int x264_ratecontrol_qp( x264_t * );
int x264_ratecontrol_end( x264_t *, int bits );
void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
int x264_ratecontrol_get_estimated_size( x264_t const *);
int x264_rc_analyse_slice( x264_t *h );
+int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
#endif
sps->i_profile_idc = PROFILE_HIGH444_PREDICTIVE;
else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
sps->i_profile_idc = PROFILE_HIGH;
- else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced )
+ else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->analyse.i_weighted_pred > 0 )
sps->i_profile_idc = PROFILE_MAIN;
else
sps->i_profile_idc = PROFILE_BASELINE;
pps->i_num_ref_idx_l0_active = 1;
pps->i_num_ref_idx_l1_active = 1;
- pps->b_weighted_pred = 0;
+ pps->b_weighted_pred = param->analyse.i_weighted_pred > 0;
pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0;
pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR ? 26 : param->rc.i_qp_constant;
*****************************************************************************
* Copyright (C) 2005-2008 x264 project
*
- * Authors: Loren Merritt <lorenm@u.washington.edu>
- * Fiona Glaser <fiona@x264.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
+ * Loren Merritt <lorenm@u.washington.edu>
+ * Dylan Yudaken <dyudaken@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
#include "macroblock.h"
#include "me.h"
+static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
+ x264_frame_t **frames, int p0, int p1, int b,
+ int b_intra_penalty );
static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
{
h->mb.b_chroma_me = 0;
}
+/* Convert a non-H.264 weight (i.e. a multiplier with an implicit denominator of 128) into an H.264 weight. */
+static void get_h264_weight( unsigned int weight_nonh264, int offset, x264_weight_t *w )
+{
+ w->i_offset = offset;
+ w->i_denom = 7;
+ w->i_scale = weight_nonh264;
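+ /* Reduce while the scale is even (lossless) or too large to code, e.g.
+ * 96/128 -> 48/64 -> 24/32 -> 12/16 -> 6/8 -> 3/4, giving i_scale = 3,
+ * i_denom = 2. */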
+ while( w->i_denom > 0 && (w->i_scale > 127 || !(w->i_scale & 1)) )
+ {
+ w->i_denom--;
+ w->i_scale >>= 1;
+ }
+ w->i_scale = X264_MIN( w->i_scale, 127 );
+}
+/* Due to a GCC bug on some platforms (win32), a 16-byte static array may not actually be 16-byte aligned, hence the 17th element. */
+ALIGNED_16( static uint8_t flat[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
+
+static void weights_plane_analyse( x264_t *h, uint8_t *plane, int width, int height, int stride, unsigned int *sum, uint64_t *var )
+{
+ int x,y;
+ unsigned int sad = 0;
+ uint64_t ssd = 0;
+ uint8_t *p = plane;
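+ /* SAD against the all-zero 'flat' block is just the pixel sum, and SSD the
+ * sum of squares; mean and variance follow directly from the two. */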
+ for( y = 0; y < height>>4; y++, p += stride*16 )
+ for( x = 0; x < width; x+=16 )
+ {
+ sad += h->pixf.sad_aligned[PIXEL_16x16]( p + x, stride, flat, 0 );
+ ssd += h->pixf.ssd[PIXEL_16x16]( p + x, stride, flat, 0 );
+ }
+
+ *sum = sad;
+ *var = ssd - (uint64_t) sad * sad / ( width * height );
+ x264_emms();
+}
+
+#define LOAD_HPELS_LUMA(dst, src) \
+{ \
+ (dst)[0] = &(src)[0][i_pel_offset]; \
+ (dst)[1] = &(src)[1][i_pel_offset]; \
+ (dst)[2] = &(src)[2][i_pel_offset]; \
+ (dst)[3] = &(src)[3][i_pel_offset]; \
+}
+
+static uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest, int b_lowres )
+{
+ uint8_t **ref_planes = b_lowres ? ref->lowres : ref->filtered;
+ int ref0_distance = fenc->i_frame - ref->i_frame - 1;
+ /* Note: this branch never runs during lookahead, as weights_analyse is
+ * only called there before any motion search has been done. */
+ if( h->frames.b_have_lowres && fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF
+ && ( h->param.analyse.i_subpel_refine || h->param.i_threads > 1 ))
+ {
+ uint8_t *src[4];
+ int i_stride = b_lowres ? fenc->i_stride_lowres : fenc->i_stride[0];
+ int i_lines = b_lowres ? fenc->i_lines_lowres : fenc->i_lines[0];
+ int i_width = b_lowres ? fenc->i_width_lowres : fenc->i_width[0];
+ int i_mb_xy = 0;
+ int mbsizeshift = b_lowres ? 3 : 4;
+ int mbsize = 1 << mbsizeshift;
+ int x,y;
+ int i_pel_offset = 0;
+
+ for( y = 0; y < i_lines; y += mbsize, i_pel_offset = y*i_stride )
+ for( x = 0; x < i_width; x += mbsize, i_mb_xy++, i_pel_offset += mbsize )
+ {
+ uint8_t *pix = &dest[ i_pel_offset ];
+ int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0] << !b_lowres;
+ int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1] << !b_lowres;
+ LOAD_HPELS_LUMA( src, ref_planes );
+ h->mc.mc_luma( pix, i_stride, src, i_stride,
+ mvx, mvy, mbsize, mbsize, weight_none );
+ }
+ return dest;
+ }
+ return ref_planes[0];
+}
+#undef LOAD_HPELS_LUMA
+
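+/* Estimate the cost of coding fenc against the given (possibly weighted)
+ * reference: per-MB mbcmp, clamped by the MB's intra cost, plus the bits
+ * needed to signal the weight in each slice header when w is non-NULL. */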
+static unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, uint8_t *src, x264_weight_t *w, int b_lowres )
+{
+ int x, y;
+ unsigned int cost = 0;
+ int mbsize = b_lowres ? 8 : 16;
+ int pixelsize = mbsize == 8 ? PIXEL_8x8 : PIXEL_16x16;
+ int i_stride = b_lowres ? fenc->i_stride_lowres : fenc->i_stride[0];
+ int i_lines = b_lowres ? fenc->i_lines_lowres : fenc->i_lines[0];
+ int i_width = b_lowres ? fenc->i_width_lowres : fenc->i_width[0];
+ uint8_t *fenc_plane = b_lowres ? fenc->lowres[0] : fenc->plane[0];
+ ALIGNED_16( uint8_t buf[16*16] );
+ int pixoff = 0;
+ int i_mb = 0;
+
+ if( w )
+ for( y = 0; y < i_lines; y += mbsize, pixoff = ( y*i_stride ) )
+ for( x = 0; x < i_width; x += mbsize, i_mb++, pixoff += mbsize)
+ {
+ w->weightfn[mbsize>>2]( buf, 16, &src[pixoff], i_stride, w, mbsize );
+ cost += X264_MIN( h->pixf.mbcmp[pixelsize]( buf, 16, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
+ }
+ else
+ for( y = 0; y < i_lines; y += mbsize, pixoff = ( y*i_stride ) )
+ for( x = 0; x < i_width; x+=mbsize, i_mb++, pixoff += mbsize )
+ cost += X264_MIN( h->pixf.mbcmp[pixelsize]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
+
+ int lambda = b_lowres ? 1 : 4;
+ if( w )
+ {
+ int numslices;
+ if( h->param.i_slice_count )
+ numslices = h->param.i_slice_count;
+ else if ( h->param.i_slice_max_mbs )
+ numslices = ( h->sps->i_mb_width * h->sps->i_mb_height + h->param.i_slice_max_mbs-1 ) / h->param.i_slice_max_mbs;
+ else
+ numslices = 1;
+ // FIXME: still need to calculate for --slice-max-size.
+ // Multiply by 2 because using a weight implies coding a duplicate reference
+ // as well; the extra 10 bits approximate the cost of signalling it.
+ cost += lambda * numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) );
+ }
+ return cost;
+}
+
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lowres, int b_lookahead )
+{
+ unsigned int fenc_sum, ref_sum;
+ float fenc_mean, ref_mean;
+ uint64_t fenc_var, ref_var;
+ int i_off, offset_search;
+ int minoff, minscale, mindenom;
+ unsigned int minscore, origscore;
+ int i_delta_index = fenc->i_frame - ref->i_frame - 1;
+ /* epsilon is chosen so the scale must differ from 1 by at least 1/128 (e.g. a numerator of 127 with denominator 128) */
+ const float epsilon = 1.0/128.0;
+
+ float guess_scale;
+ int found;
+ x264_weight_t *weights = fenc->weight[0];
+
+ weights_plane_analyse( h, fenc->plane[0], fenc->i_width[0], fenc->i_lines[0], fenc->i_stride[0], &fenc_sum, &fenc_var );
+ weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var );
+ fenc_var = round( sqrt( fenc_var ) );
+ ref_var = round( sqrt( ref_var ) );
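+ /* reduce the sums of squared deviations to their square roots so that
+ * their ratio estimates the scale */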
+ fenc_mean = (float)fenc_sum / ( fenc->i_lines[0] * fenc->i_width[0] );
+ ref_mean = (float)ref_sum / ( fenc->i_lines[0] * fenc->i_width[0] );
+
+ /* early termination: the means already match and the variance ratio is
+ * within epsilon of 1, so no useful weight exists */
+ if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - ( (float)fenc_var / ref_var ) ) < epsilon )
+ return;
+
+ guess_scale = ref_var ? (float)fenc_var/ref_var : 0;
+ get_h264_weight( round( guess_scale * 128 ), 0, &weights[0] );
+
+ found = 0;
+ mindenom = weights[0].i_denom;
+ minscale = weights[0].i_scale;
+ minoff = 0;
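+ /* Initial offset guess: fenc_mean - scale*ref_mean; the +0.5 rounding bias
+ * applies only in lookahead, where a single offset is tested. */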
+ offset_search = x264_clip3( floor( fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f*b_lookahead ), -128, 126 );
+
+ if( !fenc->b_intra_calculated )
+ {
+ x264_mb_analysis_t a;
+ x264_lowres_context_init( h, &a );
+ x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
+ }
+ uint8_t *mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0], b_lowres );
+ origscore = minscore = x264_weight_cost( h, fenc, mcbuf, 0, b_lowres );
+
+ if( !minscore )
+ return;
+
+ // Testing an extra offset gives a slight improvement due to rounding
+ // error, but lookahead tries only the single guessed offset.
+ // TODO: currently searches only offset +1. try other offsets/multipliers/combinations thereof?
+ for( i_off = offset_search; i_off <= offset_search+!b_lookahead; i_off++ )
+ {
+ SET_WEIGHT( weights[0], 1, minscale, mindenom, i_off );
+ unsigned int s = x264_weight_cost( h, fenc, mcbuf, &weights[0], b_lowres );
+ COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 );
+ }
+ x264_emms();
+
+ /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
+ if( !found || ( minscale == 1<<mindenom && minoff == 0 ) || minscore >= fenc->i_width[0] * fenc->i_lines[0] * ( b_lowres ? 2 : 8 ) )
+ {
+ SET_WEIGHT( weights[0], 0, 1, 0, 0 );
+ return;
+ }
+ else
+ SET_WEIGHT( weights[0], 1, minscale, mindenom, minoff );
+
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn )
+ fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;
+
+ if( weights[0].weightfn && b_lookahead )
+ {
+ //scale lowres in lookahead for slicetype_frame_cost
+ int i_padv = PADV<<h->param.b_interlaced;
+ uint8_t *src = ref->buffer_lowres[0];
+ uint8_t *dst = h->mb.p_weight_buf[0];
+ int width = ref->i_width_lowres + PADH*2;
+ int height = ref->i_lines_lowres + i_padv*2;
+ x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
+ width, height, &weights[0] );
+ fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ( ref->i_stride_lowres * i_padv );
+ }
+}
+
static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
- x264_frame_t **frames, int p0, int p1, int b,
- int dist_scale_factor, int do_search[2] )
+ x264_frame_t **frames, int p0, int p1, int b,
+ int dist_scale_factor, int do_search[2], const x264_weight_t *w )
{
x264_frame_t *fref0 = frames[p0];
x264_frame_t *fref1 = frames[p1];
(dst)[2] = &(src)[2][i_pel_offset]; \
(dst)[3] = &(src)[3][i_pel_offset]; \
}
+#define LOAD_WPELS_LUMA(dst,src) \
+ (dst) = &(src)[i_pel_offset];
+
#define CLIP_MV( mv ) \
{ \
mv[0] = x264_clip3( mv[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); \
uint8_t *src1, *src2; \
int i_cost; \
src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
- (mv0)[0], (mv0)[1], 8, 8 ); \
+ (mv0)[0], (mv0)[1], 8, 8, w ); \
src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
- (mv1)[0], (mv1)[1], 8, 8 ); \
+ (mv1)[0], (mv1)[1], 8, 8, w ); \
h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
m[0].p_cost_mv = a->p_cost_mv;
m[0].i_stride[0] = i_stride;
m[0].p_fenc[0] = h->mb.pic.p_fenc[0];
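+ /* Point list-0 ME at the pre-weighted lowres plane built by
+ * weights_analyse, so lookahead costs reflect weighted prediction. */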
+ m[0].weight = w;
LOAD_HPELS_LUMA( m[0].p_fref, fref0->lowres );
+ m[0].p_fref_w = m[0].p_fref[0];
+ if( w[0].weightfn )
+ LOAD_WPELS_LUMA( m[0].p_fref_w, fenc->weighted[0] );
if( b_bidir )
{
int dmv[2][2];
h->mc.memcpy_aligned( &m[1], &m[0], sizeof(x264_me_t) );
+ m[1].i_ref = p1;
+ m[1].weight = weight_none;
LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres );
+ m[1].p_fref_w = m[1].p_fref[0];
dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8;
h->sps->i_mb_width * h->sps->i_mb_height)
static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
- x264_frame_t **frames, int p0, int p1, int b,
- int b_intra_penalty )
+ x264_frame_t **frames, int p0, int p1, int b,
+ int b_intra_penalty )
{
int i_score = 0;
/* Don't use the AQ'd scores for slicetype decision. */
int i_score_aq = 0;
int do_search[2];
-
+ const x264_weight_t *w = weight_none;
/* Check whether we already evaluated this frame
* If we have tried this frame as P, then we have also tried
* the preceding frames as B. (is this still true?) */
/* For each list, check to see whether we have lowres motion-searched this reference frame before. */
do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
- if( do_search[0] ) frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+ if( do_search[0] )
+ {
+ if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
+ || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
+ {
+ x264_weights_analyse( h, frames[b], frames[p0], 1, 1 );
+ w = frames[b]->weight[0];
+ }
+ frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+ }
if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
if( b == p1 )
row_satd[ h->mb.i_mb_y ] = 0;
for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
{
- int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+ int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
int i_mb_cost_aq = i_mb_cost;
if( h->param.rc.i_aq_mode )
i_mb_cost_aq = (i_mb_cost_aq * frames[b]->i_inv_qscale_factor[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride] + 128) >> 8;
for( h->mb.i_mb_y = h->sps->i_mb_height - 2; h->mb.i_mb_y > 0; h->mb.i_mb_y-- )
for( h->mb.i_mb_x = h->sps->i_mb_width - 2; h->mb.i_mb_x > 0; h->mb.i_mb_x-- )
{
- int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+ int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
int i_mb_cost_aq = i_mb_cost;
if( h->param.rc.i_aq_mode )
i_mb_cost_aq = (i_mb_cost_aq * frames[b]->i_inv_qscale_factor[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride] + 128) >> 8;
return i_score;
}
-static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame )
+static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref0_distance )
{
int mb_index;
x264_emms();
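+ /* f_weighted_cost_delta is the weighted/unweighted cost ratio from fade
+ * analysis; when no real weights can be coded (WEIGHTP_FAKE), lower qp in
+ * fades by the equivalent amount instead. */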
+ float weightdelta = 0.0;
+ if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
+ weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
+
/* Allow the strength to be adjusted via qcompress, since the two
* concepts are very similar. */
float strength = 5.0f * (1.0f - h->param.rc.f_qcompress);
{
int propagate_cost = frame->i_propagate_cost[mb_index];
float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost);
- frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio;
+ frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * ( log2_ratio + weightdelta );
}
}
}
}
if( h->param.rc.i_vbv_buffer_size && b == p1 )
- x264_macroblock_tree_finish( h, frames[b] );
+ x264_macroblock_tree_finish( h, frames[b], b-p0 );
}
static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra )
last_nonb = cur_nonb;
}
- x264_macroblock_tree_finish( h, frames[last_nonb] );
+ x264_macroblock_tree_finish( h, frames[last_nonb], last_nonb );
}
static int x264_vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
}
}
+ /* Analyse for weighted P frames */
+ if( h->lookahead->next.list[bframes]->i_type == X264_TYPE_P )
+ {
+ memset( h->lookahead->next.list[bframes]->weight, 0, sizeof(h->lookahead->next.list[bframes]->weight) );
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->param.i_threads > 1 )
+ x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 1, 0 );
+ }
+
/* shift sequence to coded order.
use a small temporary list to avoid shifting the entire next buffer around */
int i_dts = h->lookahead->next.list[0]->i_frame;
#define MC_TEST_LUMA( w, h ) \
if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
{ \
+ const x264_weight_t *weight = weight_none; \
set_func_name( "mc_luma_%dx%d", w, h );\
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
- call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h ); \
- call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h ); \
+ call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
+ call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h, weight ); \
if( memcmp( buf3, buf4, 1024 ) ) \
{ \
fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
{ \
uint8_t *ref = dst2; \
int ref_stride = 32; \
+ const x264_weight_t *weight = weight_none; \
set_func_name( "get_ref_%dx%d", w, h );\
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
- call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h ); \
- ref = (uint8_t*) call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h ); \
+ call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
+ ref = (uint8_t*) call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
for( i=0; i<h; i++ ) \
if( memcmp( dst1+i*32, ref+i*ref_stride, w ) ) \
{ \
MC_TEST_AVG( avg, w );
report( "mc wpredb :" );
+#define MC_TEST_WEIGHT( name, weight, aligned ) \
+ int align_off = (aligned ? 0 : rand()%16); \
+ for( i = 1, ok = 1, used_asm = 0; i <= 5; i++ ) \
+ { \
+ ALIGNED_16( uint8_t buffC[640] ); \
+ ALIGNED_16( uint8_t buffA[640] ); \
+ j = X264_MAX( i*4, 2 ); \
+ memset( buffC, 0, 640 ); \
+ memset( buffA, 0, 640 ); \
+ x264_t ha; \
+ ha.mc = mc_a; \
+ /* w12 is the same as w16 in some cases */ \
+ if( i == 3 && mc_a.name[i] == mc_a.name[i+1] ) \
+ continue; \
+ if( mc_a.name[i] != mc_ref.name[i] ) \
+ { \
+ int k; \
+ set_func_name( "%s_w%d", #name, j ); \
+ used_asm = 1; \
+ call_c1( mc_c.weight[i], buffC, 32, buf2+align_off, 32, &weight, 16 ); \
+ mc_a.weight_cache(&ha, &weight); \
+ call_a1( weight.weightfn[i], buffA, 32, buf2+align_off, 32, &weight, 16 ); \
+ for( k = 0; k < 16; k++ ) \
+ if( memcmp( &buffC[k*32], &buffA[k*32], j ) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d:%d\n", i, s, o, d ); \
+ break; \
+ } \
+ call_c2( mc_c.weight[i], buffC, 32, buf2+align_off, 32, &weight, 16 ); \
+ call_a2( weight.weightfn[i], buffA, 32, buf2+align_off, 32, &weight, 16 ); \
+ } \
+ }
+
+ ok = 1; used_asm = 0;
+
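+ /* Sweep a sparse random sample of (scale, denom, offset) triples and check
+ * each asm weight function against the C reference. */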
+ int s,o,d;
+ int align_cnt = 0;
+ for( s = 0; s <= 127 && ok; s++ )
+ {
+ for( o = -128; o <= 127 && ok; o++ )
+ {
+ if( rand() & 2047 ) continue;
+ for( d = 0 ; d <= 7 && ok; d++ )
+ {
+ if( s == 1<<d )
+ continue;
+ x264_weight_t weight = { .i_scale = s, .i_denom = d, .i_offset = o };
+ MC_TEST_WEIGHT( weight, weight, (align_cnt++ % 4) );
+ }
+ }
+
+ }
+ report( "mc weight :" );
+
+ ok = 1; used_asm = 0;
+ s = 1; d = 0;
+ for( o = 0; o <= 127 && ok; o++ )
+ {
+ if( rand() & 15 ) continue;
+ x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
+ MC_TEST_WEIGHT( offsetadd, weight, (align_cnt++ % 4) );
+ }
+ report( "mc offsetadd :" );
+ ok = 1; used_asm = 0;
+ for( o = -128; o < 0 && ok; o++ )
+ {
+ if( rand() & 15 ) continue;
+ x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
+ MC_TEST_WEIGHT( offsetsub, weight, (align_cnt++ % 4) );
+ }
+ report( "mc offsetsub :" );
+
if( mc_a.hpel_filter != mc_ref.hpel_filter )
{
uint8_t *src = buf1+8+2*64;
call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 );
call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 );
// I don't care about exact rounding, this is just how close the floating-point implementation happens to be
+ x264_emms();
for( i=0; i<400; i++ )
ok &= abs(dstc[i]-dsta[i]) <= (abs(dstc[i])>512) || fabs((double)dstc[i]/dsta[i]-1) < 1e-6;
report( "mbtree propagate :" );
" - none, spatial, temporal, auto\n",
strtable_lookup( x264_direct_pred_names, defaults->analyse.i_direct_mv_pred ) );
H2( " --no-weightb Disable weighted prediction for B-frames\n" );
+ H1( " --weightp Weighted prediction for P-frames [2]\n"
+ " - 0: Disabled\n"
+ " - 1: Blind offset\n"
+ " - 2: Smart analysis\n");
H1( " --me <string> Integer pixel motion estimation method [\"%s\"]\n",
strtable_lookup( x264_motion_est_names, defaults->analyse.i_me_method ) );
H2( " - dia: diamond search, radius 1 (fast)\n"
{ "direct", required_argument, NULL, 0 },
{ "weightb", no_argument, NULL, 'w' },
{ "no-weightb", no_argument, NULL, 0 },
+ { "weightp", required_argument, NULL, 0 },
{ "me", required_argument, NULL, 0 },
{ "merange", required_argument, NULL, 0 },
{ "mvrange", required_argument, NULL, 0 },
param->analyse.i_trellis = 0;
param->i_bframe_adaptive = X264_B_ADAPT_NONE;
param->rc.b_mb_tree = 0;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
}
else if( !strcasecmp( optarg, "veryfast" ) )
{
param->analyse.b_mixed_references = 0;
param->analyse.i_trellis = 0;
param->rc.b_mb_tree = 0;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
}
else if( !strcasecmp( optarg, "faster" ) )
{
param->i_frame_reference = 2;
param->analyse.i_subpel_refine = 4;
param->rc.b_mb_tree = 0;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_BLIND;
}
else if( !strcasecmp( optarg, "fast" ) )
{
param->b_deblocking_filter = 0;
param->b_cabac = 0;
param->analyse.b_weighted_bipred = 0;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
}
else if( !strcasecmp( optarg, "touhou" ) )
{
param->b_cabac = 0;
param->i_cqm_preset = X264_CQM_FLAT;
param->i_bframe = 0;
+ param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
if( param->b_interlaced )
{
fprintf( stderr, "x264 [error]: baseline profile doesn't support interlacing\n" );
#include <stdarg.h>
-#define X264_BUILD 78
+#define X264_BUILD 79
/* x264_t:
* opaque handler for encoder */
#define X264_B_ADAPT_NONE 0
#define X264_B_ADAPT_FAST 1
#define X264_B_ADAPT_TRELLIS 2
+#define X264_WEIGHTP_NONE 0
+#define X264_WEIGHTP_BLIND 1
+#define X264_WEIGHTP_SMART 2
#define X264_B_PYRAMID_NONE 0
#define X264_B_PYRAMID_STRICT 1
#define X264_B_PYRAMID_NORMAL 2
unsigned int inter; /* inter partitions */
int b_transform_8x8;
+ int i_weighted_pred; /* weighting for P-frames */
int b_weighted_bipred; /* implicit weighting for B-frames */
int i_direct_mv_pred; /* spatial vs temporal mv prediction */
int i_chroma_qp_offset;