Make fdct32 computation flow within 16bit range

author Jingning Han <jingning@google.com>

Fri, 14 Jun 2013 18:28:56 +0000 (11:28 -0700)

committer Jingning Han <jingning@google.com>

Tue, 18 Jun 2013 16:46:24 +0000 (09:46 -0700)
author Jingning Han <jingning@google.com>
Fri, 14 Jun 2013 18:28:56 +0000 (11:28 -0700)
committer Jingning Han <jingning@google.com>
Tue, 18 Jun 2013 16:46:24 +0000 (09:46 -0700)
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h

index af35432c44cc8998ccea332801a5fc124ad2aa69..64f14c993ed065c7fe6b0c226264a0fed2e713ec 100644 (file)
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -71,12 +71,6 @@ static INLINE int dct_const_round_shift(int input) {
    return rv;
  }
  
-static INLINE int dct_32_round(int input) {
-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  assert(-131072 <= rv && rv <= 131071);
-  return rv;
-}
-
  typedef void (*transform_1d)(int16_t*, int16_t*);
  
  typedef struct {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh

index 8c78b7bbde40a4185f5dfb0980fddff93ddc4618..221718b9b0d83ca8664fbd60b2520782a7b363a0 100644 (file)
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -577,6 +577,9 @@ specialize vp9_short_fdct8x4 sse2
  prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
  specialize vp9_short_fdct32x32
  
+prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
+specialize vp9_short_fdct32x32_rd
+
  prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
  specialize vp9_short_fdct16x16 sse2
  
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h

index e78f54eb4913e589b65136f506033b81633ae1f1..59cc3d95cf479a832982d953f0d4a285d5cac1f1 100644 (file)
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -139,6 +139,9 @@ struct macroblock {
  
    int optimize;
  
+  // indicate if it is in the rd search loop or encoding process
+  int rd_search;
+
    // TODO(jingning): Need to refactor the structure arrays that buffers the
    // coding mode decisions of each partition type.
    PICK_MODE_CONTEXT ab4x4_context[4][4][4];
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c

index 8d4eec1394bf36e8bcce7ab24e3bf189696a02fa..a90bcf5df6d6e4ce10bba6651d1d33ee2a940fd1 100644 (file)
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -991,8 +991,18 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
    }
  }
  
+static INLINE int dct_32_round(int input) {
+  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  assert(-131072 <= rv && rv <= 131071);
+  return rv;
+}
+
+static INLINE int half_round_shift(int input) {
+  int rv = (input + 1 + (input < 0)) >> 2;
+  return rv;
+}
  
-static void dct32_1d(int *input, int *output) {
+static void dct32_1d(int *input, int *output, int round) {
    int step[32];
    // Stage 1
    step[0] = input[0] + input[(32 - 1)];
@@ -1101,6 +1111,44 @@ static void dct32_1d(int *input, int *output) {
    step[30] = output[30] + output[25];
    step[31] = output[31] + output[24];
  
+  // damp the magnitude by 4, hence the intermediate values are within
+  // the range of 16 bits.
+  if (round) {
+    step[0] = half_round_shift(step[0]);
+    step[1] = half_round_shift(step[1]);
+    step[2] = half_round_shift(step[2]);
+    step[3] = half_round_shift(step[3]);
+    step[4] = half_round_shift(step[4]);
+    step[5] = half_round_shift(step[5]);
+    step[6] = half_round_shift(step[6]);
+    step[7] = half_round_shift(step[7]);
+    step[8] = half_round_shift(step[8]);
+    step[9] = half_round_shift(step[9]);
+    step[10] = half_round_shift(step[10]);
+    step[11] = half_round_shift(step[11]);
+    step[12] = half_round_shift(step[12]);
+    step[13] = half_round_shift(step[13]);
+    step[14] = half_round_shift(step[14]);
+    step[15] = half_round_shift(step[15]);
+
+    step[16] = half_round_shift(step[16]);
+    step[17] = half_round_shift(step[17]);
+    step[18] = half_round_shift(step[18]);
+    step[19] = half_round_shift(step[19]);
+    step[20] = half_round_shift(step[20]);
+    step[21] = half_round_shift(step[21]);
+    step[22] = half_round_shift(step[22]);
+    step[23] = half_round_shift(step[23]);
+    step[24] = half_round_shift(step[24]);
+    step[25] = half_round_shift(step[25]);
+    step[26] = half_round_shift(step[26]);
+    step[27] = half_round_shift(step[27]);
+    step[28] = half_round_shift(step[28]);
+    step[29] = half_round_shift(step[29]);
+    step[30] = half_round_shift(step[30]);
+    step[31] = half_round_shift(step[31]);
+  }
+
    // Stage 4
    output[0] = step[0] + step[3];
    output[1] = step[1] + step[2];
@@ -1283,12 +1331,12 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
    int output[32 * 32];
  
    // Columns
-  for (i = 0; i < 32; i++) {
+  for (i = 0; i < 32; ++i) {
      int temp_in[32], temp_out[32];
-    for (j = 0; j < 32; j++)
+    for (j = 0; j < 32; ++j)
        temp_in[j] = input[j * shortpitch + i] << 2;
-    dct32_1d(temp_in, temp_out);
-    for (j = 0; j < 32; j++)
+    dct32_1d(temp_in, temp_out, 0);
+    for (j = 0; j < 32; ++j)
        output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
    }
  
@@ -1297,8 +1345,37 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
      int temp_in[32], temp_out[32];
      for (j = 0; j < 32; ++j)
        temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out);
+    dct32_1d(temp_in, temp_out, 0);
      for (j = 0; j < 32; ++j)
        out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
    }
  }
+
+// Note that although we use dct_32_round in dct32_1d computation flow,
+// this 2d fdct32x32 for rate-distortion optimization loop is operating
+// within 16 bits precision.
+void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
+  int shortpitch = pitch >> 1;
+  int i, j;
+  int output[32 * 32];
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    int temp_in[32], temp_out[32];
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = input[j * shortpitch + i] << 2;
+    dct32_1d(temp_in, temp_out, 0);
+    for (j = 0; j < 32; ++j)
+      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  }
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    int temp_in[32], temp_out[32];
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = output[j + i * 32];
+    dct32_1d(temp_in, temp_out, 1);
+    for (j = 0; j < 32; ++j)
+      out[j + i * 32] = temp_out[j];
+  }
+}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c

index 213a9c72a89005296d1f6ff345947393a46e4056..d6a82c815d84c6e855ffb709f38cdca754583886 100644 (file)
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -603,6 +603,8 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
    MACROBLOCK *const x = &cpi->mb;
    MACROBLOCKD *const xd = &x->e_mbd;
  
+  x->rd_search = 1;
+
    if (bsize < BLOCK_SIZE_SB8X8)
      if (xd->ab_index != 0)
        return;
@@ -1975,6 +1977,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
    const int mis = cm->mode_info_stride;
    const int bwl = mi_width_log2(bsize);
    const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize);
+  x->rd_search = 0;
  
    if (cm->frame_type == KEY_FRAME) {
      if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c

index 90f00d2be14c6672fc51d1d7b35ab1c1ee5daa9a..6add0623b9d27328e3c8ea52dbfe326f10a4eefb 100644 (file)
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -454,7 +454,10 @@ static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
  
    switch (ss_txfrm_size / 2) {
      case TX_32X32:
-      vp9_short_fdct32x32(src_diff, coeff, bw * 2);
+      if (x->rd_search)
+        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 2);
+      else
+        vp9_short_fdct32x32(src_diff, coeff, bw * 2);
        break;
      case TX_16X16:
        tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
author	Jingning Han <jingning@google.com>
	Fri, 14 Jun 2013 18:28:56 +0000 (11:28 -0700)
committer	Jingning Han <jingning@google.com>
	Tue, 18 Jun 2013 16:46:24 +0000 (09:46 -0700)
vp9/common/vp9_idct.h		patch \| blob \| history
vp9/common/vp9_rtcd_defs.sh		patch \| blob \| history
vp9/encoder/vp9_block.h		patch \| blob \| history
vp9/encoder/vp9_dct.c		patch \| blob \| history
vp9/encoder/vp9_encodeframe.c		patch \| blob \| history
vp9/encoder/vp9_encodemb.c		patch \| blob \| history