Optimizations and cosmetics in macroblock.c

author Fiona Glaser <fiona@x264.com>
Wed, 2 Jul 2008 05:42:39 +0000 (23:42 -0600)
committer Fiona Glaser <fiona@x264.com>
Wed, 2 Jul 2008 05:43:34 +0000 (23:43 -0600)
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 9200ace71e1c81e9a4965a88d3e0291e8c9c8768..d9ff0bc33115c73f44bb81240a84c6b1ac9abe9c 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -128,7 +128,7 @@ typedef struct
  } x264_mb_analysis_t;
  
  /* lambda = pow(2,qp/6-2) */
-static const int i_qp0_cost_table[52] = {
+const int x264_lambda_tab[52] = {
     1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
     1, 1, 1, 1,              /*  8-11 */
     1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
@@ -139,7 +139,7 @@ static const int i_qp0_cost_table[52] = {
  };
  
  /* lambda2 = pow(lambda,2) * .9 * 256 */
-static const int i_qp0_cost2_table[52] = {
+const int x264_lambda2_tab[52] = {
      14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
      91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
     580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
@@ -205,8 +205,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
      /* conduct the analysis using this lamda and QP */
      a->i_qp = h->mb.i_qp = i_qp;
      h->mb.i_chroma_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
-    a->i_lambda = i_qp0_cost_table[i_qp];
-    a->i_lambda2 = i_qp0_cost2_table[i_qp];
+    a->i_lambda = x264_lambda_tab[i_qp];
+    a->i_lambda2 = x264_lambda2_tab[i_qp];
      a->b_mbrd = h->param.analyse.i_subpel_refine >= 6 &&
                  ( h->sh.i_type != SLICE_TYPE_B || h->param.analyse.b_bframe_rdo );
  
@@ -924,7 +924,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  
          if( i_max > 0 )
          {
-            int i_chroma_lambda = i_qp0_cost2_table[h->mb.i_chroma_qp];
+            int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
              /* the previous thing encoded was x264_intra_rd(), so the pixels and
               * coefs for the current chroma mode are still around, so we only
               * have to recount the bits. */
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 7d03e41b21c6008950ea831b3ab6ae8e1de5eda5..66d034cbd6234515799a1833de4e36e3da23fdba 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -100,11 +100,16 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
      else
          h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
  
-    h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
-    h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
+    if( array_non_zero( dct4x4 ) )
+    {
+        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
+        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
  
-    /* output samples to fdec */
-    h->dctf.add4x4_idct( p_dst, dct4x4 );
+        /* output samples to fdec */
+        h->dctf.add4x4_idct( p_dst, dct4x4 );
+    }
+    else
+        memset( h->dct.luma4x4[idx], 0, sizeof(h->dct.luma4x4[idx]));
  }
  
  void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
@@ -132,7 +137,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
      uint8_t  *p_src = h->mb.pic.p_fenc[0];
      uint8_t  *p_dst = h->mb.pic.p_fdec[0];
  
-    DECLARE_ALIGNED_16( int16_t dct4x4[16+1][4][4] );
+    DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
+    DECLARE_ALIGNED_16( int16_t dct_dc4x4[4][4] );
  
      int i;
  
@@ -143,46 +149,46 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
              int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
              int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
              h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
-            dct4x4[0][block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0];
+            dct_dc4x4[block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0];
              h->dct.luma4x4[i][0] = 0;
          }
-        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
+        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
          return;
      }
  
-    h->dctf.sub16x16_dct( &dct4x4[1], p_src, p_dst );
+    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
      for( i = 0; i < 16; i++ )
      {
          /* copy dc coeff */
-        dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
-        dct4x4[1+i][0][0] = 0;
+        dct_dc4x4[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+        dct4x4[i][0][0] = 0;
  
          /* quant/scan/dequant */
          if( h->mb.b_trellis )
-            x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
+            x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
          else
-            h->quantf.quant_4x4( dct4x4[1+i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
+            h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
  
-        h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[1+i] );
-        h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
+        h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
+        h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qscale );
      }
  
-    h->dctf.dct4x4dc( dct4x4[0] );
-    h->quantf.quant_4x4_dc( dct4x4[0], h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
-    h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
+    h->dctf.dct4x4dc( dct_dc4x4 );
+    h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
+    h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
  
      /* output samples to fdec */
-    h->dctf.idct4x4dc( dct4x4[0] );
-    x264_mb_dequant_4x4_dc( dct4x4[0], h->dequant4_mf[CQM_4IY], i_qscale );  /* XXX not inversed */
+    h->dctf.idct4x4dc( dct_dc4x4 );
+    x264_mb_dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qscale );  /* XXX not inversed */
  
      /* calculate dct coeffs */
      for( i = 0; i < 16; i++ )
      {
          /* copy dc coeff */
-        dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
+        dct4x4[i][0][0] = dct_dc4x4[block_idx_y[i]][block_idx_x[i]];
      }
      /* put pixels to fdec */
-    h->dctf.add16x16_idct( p_dst, &dct4x4[1] );
+    h->dctf.add16x16_idct( p_dst, dct4x4 );
  }
  
  void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
@@ -617,7 +623,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
  
      int i_qp = h->mb.i_qp;
      int mvp[2];
-    int ch;
+    int ch, thresh;
  
      int i8x8, i4x4;
      int i_decimate_mb;
@@ -656,6 +662,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
  
      /* encode chroma */
      i_qp = h->mb.i_chroma_qp;
+    thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
  
      for( ch = 0; ch < 2; ch++ )
      {
@@ -669,6 +676,11 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
                               mvp[0], mvp[1], 8, 8 );
          }
  
+        /* there is almost never a termination during chroma, but we can't avoid the check entirely */
+        /* so instead we check SSD and skip the actual check if the score is low enough. */
+        if( h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) < thresh )
+            continue;
+
          h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
  
          /* calculate dct DC */
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index 5ac58349ce7d4ac06310f86849e42c8dc90047d0..ba7be6906920e4b7386cb364e18ce338eda548ff 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -26,6 +26,9 @@
  
  #include "common/macroblock.h"
  
+extern const int x264_lambda2_tab[52];
+extern const int x264_lambda_tab[52];
+
  void x264_rdo_init( );
  
  int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index d72e40a567a77ffdf914909593bdb55d5b913210..27257c6036424c551b0e755fe80bc646af7a703a 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -32,7 +32,7 @@
  static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
  {
      a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal
-    a->i_lambda = i_qp0_cost_table[ a->i_qp ];
+    a->i_lambda = x264_lambda_tab[ a->i_qp ];
      x264_mb_analyse_load_costs( h, a );
      h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia?
      h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now
author	Fiona Glaser <fiona@x264.com>
	Wed, 2 Jul 2008 05:42:39 +0000 (23:42 -0600)
committer	Fiona Glaser <fiona@x264.com>
	Wed, 2 Jul 2008 05:43:34 +0000 (23:43 -0600)
encoder/analyse.c		patch | blob | history
encoder/macroblock.c		patch | blob | history
encoder/macroblock.h		patch | blob | history
encoder/slicetype.c		patch | blob | history