Improve subme7 at low QPs and add subme7 support in lossless mode

author Fiona Glaser <fiona@x264.com>
Sat, 9 Aug 2008 15:34:37 +0000 (09:34 -0600)
committer Fiona Glaser <fiona@x264.com>
Sat, 9 Aug 2008 15:41:03 +0000 (09:41 -0600)
diff --git a/encoder/analyse.c b/encoder/analyse.c

index 270b90ae0d2e8543152b55e0f67c1b12296e10f8..5362ba132d7135e083fcb4f417be668b43c828d4 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -782,7 +782,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
      uint8_t  *p_dst = h->mb.pic.p_fdec[0];
  
      int i, j, idx, x, y;
-    int i_max, i_satd, i_best, i_mode, i_thresh;
+    int i_max, i_mode, i_thresh;
+    uint64_t i_satd, i_best;
      int i_pred_mode;
      int predict_mode[9];
      h->mb.i_skip_intra = 0;
@@ -810,7 +811,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
          for( idx = 0; idx < 16; idx++ )
          {
              uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
-            i_best = COST_MAX;
+            i_best = COST_MAX64;
  
              i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
  
@@ -860,7 +861,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
              int j;
              i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
  
-            i_best = COST_MAX;
+            i_best = COST_MAX64;
              i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
              x = idx&1;
              y = idx>>1;
diff --git a/encoder/encoder.c b/encoder/encoder.c

index 76fd1454888ce10a10c1e309b4cc6d736f98a7df..cf6997008a93609c4af82e076628a3a3b1306bfd 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -410,7 +410,6 @@ static int x264_validate_parameters( x264_t *h )
          h->param.analyse.i_trellis = 0;
          h->param.analyse.b_fast_pskip = 0;
          h->param.analyse.i_noise_reduction = 0;
-        h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 );
      }
      if( h->param.rc.i_rc_method == X264_RC_CQP )
      {
diff --git a/encoder/macroblock.c b/encoder/macroblock.c

index 51c56840361f474665ed98cb6520e0a20d645c6a..8c14302f300b86348633840a3152b35a162111c3 100644 (file)
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -747,76 +747,96 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
      uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
      uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
      int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
-    int nnz8x8;
+    int nnz8x8 = 0;
      int ch;
  
      x264_mb_mc_8x8( h, i8 );
  
-    if( h->mb.b_transform_8x8 )
+    if( h->mb.b_lossless )
      {
-        DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
-        h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
-        h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
-        h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
-
-        if( b_decimate )
-            nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 );
-        else
-            nnz8x8 = array_non_zero( dct8x8 );
-
-        if( nnz8x8 )
+        int i4;
+        for( i4 = i8*4; i4 < i8*4+4; i4++ )
          {
-            h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
-            h->dctf.add8x8_idct8( p_fdec, dct8x8 );
+            h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
+                                h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
+                                h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
+            nnz8x8 |= array_non_zero( h->dct.luma4x4[i4] );
+        }
+        for( ch = 0; ch < 2; ch++ )
+        {
+            p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
+            p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
+            h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec );
+            h->dct.luma4x4[16+i8+ch*4][0] = 0;
          }
      }
      else
      {
-        int i4;
-        DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
-        h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
-        for( i4 = 0; i4 < 4; i4++ )
-            h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
-        for( i4 = 0; i4 < 4; i4++ )
-            h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
-
-        if( b_decimate )
+        if( h->mb.b_transform_8x8 )
          {
-            int i_decimate_8x8 = 0;
-            for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ )
-                i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[i8*4+i4], 16 );
-            nnz8x8 = 4 <= i_decimate_8x8;
+            DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
+            h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
+            h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
+            h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
+
+            if( b_decimate )
+                nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 );
+            else
+                nnz8x8 = array_non_zero( dct8x8 );
+
+            if( nnz8x8 )
+            {
+                h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
+                h->dctf.add8x8_idct8( p_fdec, dct8x8 );
+            }
          }
          else
-            nnz8x8 = array_non_zero( dct4x4 );
-
-        if( nnz8x8 )
          {
+            int i4;
+            DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
+            h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
              for( i4 = 0; i4 < 4; i4++ )
-                h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
-            h->dctf.add8x8_idct( p_fdec, dct4x4 );
+                h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+            for( i4 = 0; i4 < 4; i4++ )
+                h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
+
+            if( b_decimate )
+            {
+                int i_decimate_8x8 = 0;
+                for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ )
+                    i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[i8*4+i4], 16 );
+                nnz8x8 = 4 <= i_decimate_8x8;
+            }
+            else
+                nnz8x8 = array_non_zero( dct4x4 );
+
+            if( nnz8x8 )
+            {
+                for( i4 = 0; i4 < 4; i4++ )
+                    h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
+                h->dctf.add8x8_idct( p_fdec, dct4x4 );
+            }
          }
-    }
  
-    i_qp = h->mb.i_chroma_qp;
+        i_qp = h->mb.i_chroma_qp;
  
-    for( ch = 0; ch < 2; ch++ )
-    {
-        DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
-        p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
-        p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
-
-        h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
-        h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
-        h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
-        h->dct.luma4x4[16+i8+ch*4][0] = 0;
-        if( array_non_zero( dct4x4 ) )
+        for( ch = 0; ch < 2; ch++ )
          {
-            h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
-            h->dctf.add4x4_idct( p_fdec, dct4x4 );
+            DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
+            p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
+            p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
+
+            h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
+            h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
+            h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
+            h->dct.luma4x4[16+i8+ch*4][0] = 0;
+            if( array_non_zero( dct4x4 ) )
+            {
+                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
+                h->dctf.add4x4_idct( p_fdec, dct4x4 );
+            }
          }
      }
-
      h->mb.i_cbp_luma &= ~(1 << i8);
      h->mb.i_cbp_luma |= nnz8x8 << i8;
      h->mb.i_cbp_chroma = 0x02;
diff --git a/encoder/me.c b/encoder/me.c

index d4f3eaa6f70a2c5a585e0352fa3bbaebe732d2df..f4f7e502ed1a4352dd26a8eb6030f7f74a03dcf7 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -913,7 +913,7 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
  { \
      if( satd <= bsatd * SATD_THRESH )\
      { \
-        int cost; \
+        uint64_t cost; \
          *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
          cost = x264_rd_cost_part( h, i_lambda2, i8, m->i_pixel ); \
          COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
@@ -934,7 +934,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
      const int i_pixel = m->i_pixel;
  
      DECLARE_ALIGNED_16( uint8_t pix[16*16] );
-    int bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX;
+    uint64_t bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX64;
      int bmx = m->mv[0];
      int bmy = m->mv[1];
      int omx = bmx;
diff --git a/encoder/me.h b/encoder/me.h

index 34806e120c0d13c3ff3f91dde1b8e07b4b3538b5..655c2a140c5804137bbfd128fa720a8383a8e552 100644 (file)
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -25,6 +25,7 @@
  #define X264_ME_H
  
  #define COST_MAX (1<<28)
+#define COST_MAX64 (1ULL<<60)
  
  typedef struct
  {
@@ -54,7 +55,7 @@ static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], i
  void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
  void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 );
  int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
-int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
+uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
  
  extern uint16_t *x264_cost_mv_fpel[52][4];
  
diff --git a/encoder/rdo.c b/encoder/rdo.c

index 76bf57bed985fe9ff8b3c5796cba67c4eca6c008..650b5ae5709143403f9afeeee20aec4afbbb249c 100644 (file)
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -101,9 +101,11 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
      return i_ssd + i_bits;
  }
  
-int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
+/* subpartition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
+
+uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
  {
-    int i_ssd, i_bits;
+    uint64_t i_ssd, i_bits;
  
      if( i_pixel == PIXEL_16x16 )
      {
@@ -128,19 +130,19 @@ int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
          x264_cabac_t cabac_tmp;
          COPY_CABAC;
          x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
-        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
+        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
      else
      {
-        i_bits = ( x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2 + 128 ) >> 8;
+        i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
      }
  
-    return i_ssd + i_bits;
+    return (i_ssd<<8) + i_bits;
  }
  
-int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
+uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
  {
-    int i_ssd, i_bits;
+    uint64_t i_ssd, i_bits;
  
      x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
      i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
@@ -150,19 +152,19 @@ int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
          x264_cabac_t cabac_tmp;
          COPY_CABAC;
          x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
-        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
+        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
      else
      {
-        i_bits = ( x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2 + 128 ) >> 8;
+        i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
      }
  
-    return i_ssd + i_bits;
+    return (i_ssd<<8) + i_bits;
  }
  
-int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
+uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
  {
-    int i_ssd, i_bits;
+    uint64_t i_ssd, i_bits;
  
      x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
      i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
@@ -172,19 +174,19 @@ int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
          x264_cabac_t cabac_tmp;
          COPY_CABAC;
          x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
-        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
+        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
      else
      {
-        i_bits = ( x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2 + 128 ) >> 8;
+        i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
      }
  
-    return i_ssd + i_bits;
+    return (i_ssd<<8) + i_bits;
  }
  
-int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
+uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
  {
-    int i_ssd, i_bits;
+    uint64_t i_ssd, i_bits;
  
      if( b_dct )
          x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
@@ -198,14 +200,14 @@ int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
          x264_cabac_t cabac_tmp;
          COPY_CABAC;
          x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
-        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
+        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
      else
      {
-        i_bits = ( x264_i8x8_chroma_size_cavlc( h ) * i_lambda2 + 128 ) >> 8;
+        i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2;
      }
  
-    return i_ssd + i_bits;
+    return (i_ssd<<8) + i_bits;
  }
  /****************************************************************************
   * Trellis RD quantization
author	Fiona Glaser <fiona@x264.com>
	Sat, 9 Aug 2008 15:34:37 +0000 (09:34 -0600)
committer	Fiona Glaser <fiona@x264.com>
	Sat, 9 Aug 2008 15:41:03 +0000 (09:41 -0600)
encoder/analyse.c		patch | blob | history
encoder/encoder.c		patch | blob | history
encoder/macroblock.c		patch | blob | history
encoder/me.c		patch | blob | history
encoder/me.h		patch | blob | history
encoder/rdo.c		patch | blob | history