Optimize rounding of luma and chroma DC coefficients

author Fiona Glaser <fiona@x264.com>

Thu, 10 Sep 2009 09:55:21 +0000 (02:55 -0700)

committer Fiona Glaser <fiona@x264.com>

Sun, 13 Sep 2009 02:44:59 +0000 (19:44 -0700)
author Fiona Glaser <fiona@x264.com>
Thu, 10 Sep 2009 09:55:21 +0000 (02:55 -0700)
committer Fiona Glaser <fiona@x264.com>
Sun, 13 Sep 2009 02:44:59 +0000 (19:44 -0700)
diff --git a/encoder/macroblock.c b/encoder/macroblock.c

index 60e14b8fd3c2732acf7d8cdffb85919142455fc9..ccee06a8f3a1e50963c96339871b3d102b87eb77 100644 (file)
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -59,13 +59,13 @@ static inline void idct_dequant_2x2_dc( int16_t dct[2][2], int16_t dct4x4[4][4][
      dct4x4[3][0][0] = (d2 - d3) * dmf >> -qbits;
  }
  
-static inline void idct_dequant_2x2_dconly( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+static inline void idct_dequant_2x2_dconly( int16_t out[2][2], int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
  {
      IDCT_DEQUANT_START
-    dct[0][0] = (d0 + d1) * dmf >> -qbits;
-    dct[0][1] = (d0 - d1) * dmf >> -qbits;
-    dct[1][0] = (d2 + d3) * dmf >> -qbits;
-    dct[1][1] = (d2 - d3) * dmf >> -qbits;
+    out[0][0] = (d0 + d1) * dmf >> -qbits; /* The four results are stored in out instead of being written back over dct. */
+    out[0][1] = (d0 - d1) * dmf >> -qbits; /* d0..d3, dmf and qbits are not declared here; presumably IDCT_DEQUANT_START defines them. */
+    out[1][0] = (d2 + d3) * dmf >> -qbits; /* x264_mb_encode_8x8_chroma passes the same array as both out and dct. */
+    out[1][1] = (d2 - d3) * dmf >> -qbits;
  }
  
  static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
@@ -276,6 +276,64 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
          h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
  }
  
+static inline int idct_dequant_round_2x2_dc( int16_t ref[2][2], int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+{   /* Returns nonzero when dequantizing dct no longer rounds to the same values as ref, zero when it still does. */
+    int16_t out[2][2]; /* scratch block that receives the dequantized result */
+    idct_dequant_2x2_dconly( out, dct, dequant_mf, i_qp );
+    return ((ref[0][0] ^ (out[0][0]+32))        /* XOR leaves a bit set wherever ref and out+32 differ; */
+          | (ref[0][1] ^ (out[0][1]+32))        /* OR merges the four coefficients into one value; */
+          | (ref[1][0] ^ (out[1][0]+32))        /* >> 6 then discards the low six bits, so only a difference at bit 6 or above counts. */
+          | (ref[1][1] ^ (out[1][1]+32))) >> 6; /* ref is used as passed: x264_mb_optimize_chroma_dc hands in values that already include the +32. */
+}
+
+/* Round down coefficients losslessly in DC-only chroma blocks: each level is stepped toward zero for as long
+ * as the dequantized block still rounds (+32, >>6) to the same values as the original levels did. Returns 0 if
+ * the block already rounds to zero or every level was reduced to zero, 1 otherwise. Unlike luma blocks, this
+ * can't be done with a lookup table because the chroma DC transform makes the coefficients interdependent. */
+static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp, int16_t dct2x2[2][2] )
+{
+    int16_t dct2x2_orig[2][2]; /* dequantized input levels plus the +32 rounding offset: the reference every trial must match */
+    int coeff;
+    int nz = 0;                /* OR of the levels that remain after optimization */
+
+    /* If the QP is too high, there's no benefit to rounding optimization. */
+    if( h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0][0] << (i_qp/6) > 32*64 )
+        return 1; /* dct2x2 is returned unmodified */
+
+    idct_dequant_2x2_dconly( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+    dct2x2_orig[0][0] += 32;
+    dct2x2_orig[0][1] += 32;
+    dct2x2_orig[1][0] += 32;
+    dct2x2_orig[1][1] += 32;
+
+    /* If the DC coefficients already round to zero, terminate early. */
+    if( !((dct2x2_orig[0][0]|dct2x2_orig[0][1]|dct2x2_orig[1][0]|dct2x2_orig[1][1])>>6) )
+        return 0; /* dct2x2 itself is not cleared on this path */
+
+    /* Start with the highest frequency coefficient... is this the best option? */
+    for( coeff = 3; coeff >= 0; coeff-- )
+    {
+        int sign = dct2x2[coeff>>1][coeff&1] < 0 ? -1 : 1; /* [coeff>>1][coeff&1] reaches all four entries without subscripting past the end of row 0 */
+        int level = dct2x2[coeff>>1][coeff&1];
+
+        if( !level )
+            continue;
+
+        while( level )
+        {
+            dct2x2[coeff>>1][coeff&1] = level - sign; /* trial: one step closer to zero */
+            if( idct_dequant_round_2x2_dc( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
+                break; /* the rounded output changed, so the trial is rejected and level keeps its last accepted value */
+            level -= sign;
+        }
+
+        nz |= level;
+        dct2x2[coeff>>1][coeff&1] = level; /* store the accepted level, undoing a rejected trial if there was one */
+    }
+
+    return !!nz;
+}
+
  void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
  {
      int i, ch, nz, nz_dc;
@@ -315,11 +373,14 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
                      else
                          nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<
      1 );
+
                      if( nz_dc )
                      {
+                        if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
+                            continue;
                          h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
                          zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
-                        idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+                        idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
                          h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
                          h->mb.i_cbp_chroma = 1;
                      }
@@ -388,9 +449,14 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
              h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
              if( !nz_dc ) /* Whole block is empty */
                  continue;
+            if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
+            {
+                h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
+                continue;
+            }
              /* DC-only */
              zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
-            idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+            idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
              h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
          }
          else
diff --git a/encoder/rdo.c b/encoder/rdo.c

index 453ccb5b38a1ce6c4014fa3967b038001a47a5c7..8abce17cc8442a579443bbbbf3f29efc56d67606 100644 (file)
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -560,7 +560,14 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
                      n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
                  }
  
-                n.score += ssd;
+                if( j || i || dc )
+                    n.score += ssd;
+                /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
+                else
+                {
+                    d = i_coef * signs[0] - ((unquant_abs_level * signs[0] + 8)&~15);
+                    n.score += (int64_t)d*d * coef_weight[i];
+                }
  
                  /* save the node if it's better than any existing node with the same cabac ctx */
                  if( n.score < nodes_cur[node_ctx].score )
author	Fiona Glaser <fiona@x264.com>
	Thu, 10 Sep 2009 09:55:21 +0000 (02:55 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Sun, 13 Sep 2009 02:44:59 +0000 (19:44 -0700)
encoder/macroblock.c		patch \| blob \| history
encoder/rdo.c		patch \| blob \| history