From: Yunqing Wang <yunqingwang@google.com>
Date: Wed, 13 Feb 2013 08:19:32 +0000 (-0800)
Subject: Rewrote fdct16x16
X-Git-Tag: v1.3.0~1151^2~170^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=048b9d41a6718e5b491dc9dd3fbaa9939fa07cb5;p=libvpx

Rewrote fdct16x16

Used same algorithm as others.

Change-Id: Ifdac560762aec9735cb4bb6f1dbf549e415c38a0
---

diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 746648291..230c7e986 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -1250,6 +1250,9 @@ void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {
 }
 
 #else
+
+#define NEW_FDCT16 1
+#if !NEW_FDCT16
 static const int16_t C1 = 16305;
 static const int16_t C2 = 16069;
 static const int16_t C3 = 15679;
@@ -1469,6 +1472,137 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {
 }
 #undef RIGHT_SHIFT
 #undef ROUNDING
+
+#else
+// Rewrote to use same algorithm as others.
+static void fdct16_1d(int16_t input[16], int16_t output[16]) {
+  int16_t step[16];
+  int temp1, temp2;
+
+  // step 1
+  step[ 0] = input[0] + input[15];
+  step[ 1] = input[1] + input[14];
+  step[ 2] = input[2] + input[13];
+  step[ 3] = input[3] + input[12];
+  step[ 4] = input[4] + input[11];
+  step[ 5] = input[5] + input[10];
+  step[ 6] = input[6] + input[ 9];
+  step[ 7] = input[7] + input[ 8];
+  step[ 8] = input[7] - input[ 8];
+  step[ 9] = input[6] - input[ 9];
+  step[10] = input[5] - input[10];
+  step[11] = input[4] - input[11];
+  step[12] = input[3] - input[12];
+  step[13] = input[2] - input[13];
+  step[14] = input[1] - input[14];
+  step[15] = input[0] - input[15];
+
+  fdct8_1d(step, step);
+
+  // step 2
+  output[8] = step[8];
+  output[9] = step[9];
+  temp1 = (-step[10] + step[13]) * cospi_16_64;
+  temp2 = (-step[11] + step[12]) * cospi_16_64;
+  output[10] = dct_const_round_shift(temp1);
+  output[11] = dct_const_round_shift(temp2);
+  temp1 = (step[11] + step[12]) * cospi_16_64;
+  temp2 = (step[10] + step[13]) * cospi_16_64;
+  output[12] = dct_const_round_shift(temp1);
+  output[13] = dct_const_round_shift(temp2);
+  output[14] = step[14];
+  output[15] = step[15];
+
+  // step 3
+  step[ 8] = output[8] + output[11];
+  step[ 9] = output[9] + output[10];
+  step[ 10] = output[9] - output[10];
+  step[ 11] = output[8] - output[11];
+  step[ 12] = -output[12] + output[15];
+  step[ 13] = -output[13] + output[14];
+  step[ 14] = output[13] + output[14];
+  step[ 15] = output[12] + output[15];
+
+  // step 4
+  output[8] = step[8];
+  temp1 = -step[9] * cospi_8_64 + step[14] * cospi_24_64;
+  temp2 = -step[10] * cospi_24_64 - step[13] * cospi_8_64;
+  output[9] = dct_const_round_shift(temp1);
+  output[10] = dct_const_round_shift(temp2);
+  output[11] = step[11];
+  output[12] = step[12];
+  temp1 = -step[10] * cospi_8_64 + step[13] * cospi_24_64;
+  temp2 = step[9] * cospi_24_64 + step[14] * cospi_8_64;
+  output[13] = dct_const_round_shift(temp1);
+  output[14] = dct_const_round_shift(temp2);
+  output[15] = step[15];
+
+  // step 5
+  step[8] = output[8] + output[9];
+  step[9] = output[8] - output[9];
+  step[10] = -output[10] + output[11];
+  step[11] = output[10] + output[11];
+  step[12] = output[12] + output[13];
+  step[13] = output[12] - output[13];
+  step[14] = -output[14] + output[15];
+  step[15] = output[14] + output[15];
+
+  // step 6
+  output[0] = step[0];
+  output[8] = step[4];
+  output[4] = step[2];
+  output[12] = step[6];
+  output[2] = step[1];
+  output[10] = step[5];
+  output[6] = step[3];
+  output[14] = step[7];
+
+  temp1 = step[8] * cospi_30_64 + step[15] * cospi_2_64;
+  temp2 = step[9] * cospi_14_64 + step[14] * cospi_18_64;
+  output[1] = dct_const_round_shift(temp1);
+  output[9] = dct_const_round_shift(temp2);
+
+  temp1 = step[10] * cospi_22_64 + step[13] * cospi_10_64;
+  temp2 = step[11] * cospi_6_64 + step[12] * cospi_26_64;
+  output[5] = dct_const_round_shift(temp1);
+  output[13] = dct_const_round_shift(temp2);
+
+  temp1 = -step[11] * cospi_26_64 + step[12] * cospi_6_64;
+  temp2 = -step[10] * cospi_10_64 + step[13] * cospi_22_64;
+  output[3] = dct_const_round_shift(temp1);
+  output[11] = dct_const_round_shift(temp2);
+
+  temp1 = -step[9] * cospi_18_64 + step[14] * cospi_14_64;
+  temp2 = -step[8] * cospi_2_64 + step[15] * cospi_30_64;
+  output[7] = dct_const_round_shift(temp1);
+  output[15] = dct_const_round_shift(temp2);
+}
+
+void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {
+  int shortpitch = pitch >> 1;
+  int i, j;
+  int16_t output[256];
+  int16_t temp_in[16], temp_out[16];
+
+  // First transform columns
+  for (i = 0; i < 16; i++) {
+    for (j = 0; j < 16; j++)
+      temp_in[j] = input[j * shortpitch + i];
+    fdct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; j++)
+      output[j * 16 + i] = temp_out[j];
+  }
+
+  // Then transform rows
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = output[j + i * 16];
+    fdct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      out[j + i * 16] = temp_out[j];
+  }
+}
+#endif
 #endif
 
 #define TEST_INT_32x32_DCT 1