From fa36981ec8db79a156d698ddb455509756f97aec Mon Sep 17 00:00:00 2001
From: Yaowu Xu <yaowu@google.com>
Date: Mon, 4 Feb 2013 15:22:32 -0800
Subject: [PATCH] rewrite 4x4 idct and fdct

This commit changes the 4x4 iDCT to use the same algorithm and constants
as the other iDCTs. The 4x4 fDCT is also changed to be based on the new
iDCT.

Change-Id: Ib1a902693228af903862e1f5a08078c36f2089b0
---
 vp9/common/vp9_idctllm.c | 244 ++++++++++++++++++---------------------
 vp9/encoder/vp9_dct.c    | 119 +++++++++++++------
 2 files changed, 194 insertions(+), 169 deletions(-)

diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 7dd2776f6..055d8d8b0 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -29,52 +29,6 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 
-static const int cospi8sqrt2minus1 = 20091;
-static const int sinpi8sqrt2      = 35468;
-static const int rounding = 0;
-
-// Constants and Macros used by 16 and 32 point idct functions
-#define DCT_CONST_BITS 14
-#define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
-// Constants are 16384 * cos(kPi/64) where k = 1 to 31.
-// Note: sin(kPi/64) = cos((32-k)Pi/64)
-static const int cospi_1_64  = 16364;
-static const int cospi_2_64  = 16305;
-static const int cospi_3_64  = 16207;
-static const int cospi_4_64  = 16069;
-static const int cospi_5_64  = 15893;
-static const int cospi_6_64  = 15679;
-static const int cospi_7_64  = 15426;
-static const int cospi_8_64  = 15137;
-static const int cospi_9_64  = 14811;
-static const int cospi_10_64 = 14449;
-static const int cospi_11_64 = 14053;
-static const int cospi_12_64 = 13623;
-static const int cospi_13_64 = 13160;
-static const int cospi_14_64 = 12665;
-static const int cospi_15_64 = 12140;
-static const int cospi_16_64 = 11585;
-static const int cospi_17_64 = 11003;
-static const int cospi_18_64 = 10394;
-static const int cospi_19_64 = 9760;
-static const int cospi_20_64 = 9102;
-static const int cospi_21_64 = 8423;
-static const int cospi_22_64 = 7723;
-static const int cospi_23_64 = 7005;
-static const int cospi_24_64 = 6270;
-static const int cospi_25_64 = 5520;
-static const int cospi_26_64 = 4756;
-static const int cospi_27_64 = 3981;
-static const int cospi_28_64 = 3196;
-static const int cospi_29_64 = 2404;
-static const int cospi_30_64 = 1606;
-static const int cospi_31_64 = 804;
-
-static int16_t dct_const_round_shift(int input) {
-  int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-  assert((rv <= INT16_MAX) && (rv >= INT16_MIN));
-  return (int16_t)rv;
-}
 
 
 static const int16_t idct_i4[16] = {
@@ -307,93 +261,6 @@ void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
   }
 }
 
-void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-
-  int16_t *ip = input;
-  int16_t *op = output;
-  int temp1, temp2;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[8];
-    b1 = ip[0] - ip[8];
-
-    temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
-    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
-    c1 = temp1 - temp2;
-
-    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
-    temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
-    d1 = temp1 + temp2;
-
-    op[shortpitch * 0] = a1 + d1;
-    op[shortpitch * 3] = a1 - d1;
-
-    op[shortpitch * 1] = b1 + c1;
-    op[shortpitch * 2] = b1 - c1;
-
-    ip++;
-    op++;
-  }
-
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[2];
-    b1 = ip[0] - ip[2];
-
-    temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
-    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
-    c1 = temp1 - temp2;
-
-    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
-    temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
-    d1 = temp1 + temp2;
-
-    op[0] = (a1 + d1 + 16) >> 5;
-    op[3] = (a1 - d1 + 16) >> 5;
-
-    op[1] = (b1 + c1 + 16) >> 5;
-    op[2] = (b1 - c1 + 16) >> 5;
-
-    ip += shortpitch;
-    op += shortpitch;
-  }
-}
-
-void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) {
-  int i;
-  int a1;
-  int16_t *op = output;
-  int shortpitch = pitch >> 1;
-  a1 = ((input[0] + 16) >> 5);
-  for (i = 0; i < 4; i++) {
-    op[0] = a1;
-    op[1] = a1;
-    op[2] = a1;
-    op[3] = a1;
-    op += shortpitch;
-  }
-}
-
-void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
-                            uint8_t *dst_ptr, int pitch, int stride) {
-  int a1 = ((input_dc + 16) >> 5);
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
-    }
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
-}
-
 void vp9_short_inv_walsh4x4_c(int16_t *input, int16_t *output) {
   int i;
   int a1, b1, c1, d1;
@@ -590,6 +457,50 @@ void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr,
 }
 #endif
 
+// Constants and Macros used by all idct functions
+// TODO(Yaowu): move these to a header file as they are shared by DCTs and iDCTs
+#define DCT_CONST_BITS 14  // fractional bits in the cospi_* constants
+#define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))  // 0.5 at that scale
+// cospi_k_64 is cos(k*Pi/64) scaled by 16384 (1 << DCT_CONST_BITS), k = 1..31.
+// Note: sin(kPi/64) = cos((32-k)Pi/64), so the same table supplies sines.
+static const int cospi_1_64  = 16364;
+static const int cospi_2_64  = 16305;
+static const int cospi_3_64  = 16207;
+static const int cospi_4_64  = 16069;
+static const int cospi_5_64  = 15893;
+static const int cospi_6_64  = 15679;
+static const int cospi_7_64  = 15426;
+static const int cospi_8_64  = 15137;
+static const int cospi_9_64  = 14811;
+static const int cospi_10_64 = 14449;
+static const int cospi_11_64 = 14053;
+static const int cospi_12_64 = 13623;
+static const int cospi_13_64 = 13160;
+static const int cospi_14_64 = 12665;
+static const int cospi_15_64 = 12140;
+static const int cospi_16_64 = 11585;
+static const int cospi_17_64 = 11003;
+static const int cospi_18_64 = 10394;
+static const int cospi_19_64 = 9760;
+static const int cospi_20_64 = 9102;
+static const int cospi_21_64 = 8423;
+static const int cospi_22_64 = 7723;
+static const int cospi_23_64 = 7005;
+static const int cospi_24_64 = 6270;
+static const int cospi_25_64 = 5520;
+static const int cospi_26_64 = 4756;
+static const int cospi_27_64 = 3981;
+static const int cospi_28_64 = 3196;
+static const int cospi_29_64 = 2404;
+static const int cospi_30_64 = 1606;
+static const int cospi_31_64 = 804;
+
+static inline int dct_const_round_shift(int input) {  // rounding right shift
+  const int out = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+  assert((out >= INT16_MIN) && (out <= INT16_MAX));  // must fit in int16_t
+  return out;
+}
+
 void idct4_1d(int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
@@ -610,6 +521,73 @@ void idct4_1d(int16_t *input, int16_t *output) {
   output[3] = step[0] - step[3];
 }
 
+void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
+  // 4x4 inverse DCT built from idct4_1d(): rows first, then columns.
+  // input is read as 16 contiguous coefficients (4 per row); output rows are
+  // pitch >> 1 int16_t elements apart.
+  const int out_stride = pitch >> 1;
+  int16_t rows[4 * 4];
+  int16_t line_in[4], line_out[4];
+  int r, c;
+  // Pass 1: transform each input row into the intermediate buffer.
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < 4; ++c)
+      line_in[c] = input[r * 4 + c];
+    idct4_1d(line_in, &rows[r * 4]);
+  }
+  // Pass 2: transform each intermediate column, then round away 4 bits.
+  for (c = 0; c < 4; ++c) {
+    for (r = 0; r < 4; ++r)
+      line_in[r] = rows[r * 4 + c];
+    idct4_1d(line_in, line_out);
+    for (r = 0; r < 4; ++r)
+      output[r * out_stride + c] = (line_out[r] + 8) >> 4;
+  }
+}
+
+void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) {
+  // DC-only variant of the 4x4 inverse transform: only input[0] is read and
+  // one value is written to all 16 positions of the 4x4 output block.
+  const int out_stride = pitch >> 1;  // output row step in int16_t elements
+  int16_t dc;
+  int fill;
+  int r, c;
+
+  // Scale by cospi_16_64 twice, rounding (and narrowing to int16_t) after
+  // each multiply, then apply the final (x + 8) >> 4 rounding shift.
+  dc = dct_const_round_shift(input[0] * cospi_16_64);
+  dc = dct_const_round_shift(dc * cospi_16_64);
+  fill = (dc + 8) >> 4;
+
+  for (r = 0; r < 4; ++r) {
+    int16_t *row = output + r * out_stride;
+    for (c = 0; c < 4; ++c) {
+      row[c] = fill;
+    }
+  }
+}
+
+void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
+                            uint8_t *dst_ptr, int pitch, int stride) {
+  // Adds the inverse-transformed DC value to a 4x4 block of predictor
+  // samples (rows `pitch` apart) and stores clip_pixel() of each sum to
+  // dst_ptr (rows `stride` apart).
+  int16_t dc;
+  int residual;
+  int r, c;
+
+  dc = dct_const_round_shift(input_dc * cospi_16_64);
+  dc = dct_const_round_shift(dc * cospi_16_64);
+  residual = (dc + 8) >> 4;
+
+  for (r = 0; r < 4; ++r) {
+    const uint8_t *pred_row = pred_ptr + r * pitch;
+    uint8_t *dst_row = dst_ptr + r * stride;
+    for (c = 0; c < 4; ++c)
+      dst_row[c] = clip_pixel(residual + pred_row[c]);
+  }
+}
+
 void idct8_1d(int16_t *input, int16_t *output) {
   int16_t step1[8], step2[8];
   int temp1, temp2;
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 32e7b3fbc..fbbea9aa0 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -725,48 +725,95 @@ void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,
     op += tx_dim;
   }
 }
+// Constants and macros for the forward DCT; duplicated from vp9_idctllm.c.
+// TODO(Yaowu): move these to a header file as they are shared by DCTs and iDCTs
+#define DCT_CONST_BITS 14  // fractional bits in the cospi_* constants
+#define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))  // 0.5 at that scale
+// cospi_k_64 is cos(k*Pi/64) scaled by 16384 (1 << DCT_CONST_BITS), k = 1..31.
+// Note: sin(kPi/64) = cos((32-k)Pi/64), so the same table supplies sines.
+static const int cospi_1_64  = 16364;
+static const int cospi_2_64  = 16305;
+static const int cospi_3_64  = 16207;
+static const int cospi_4_64  = 16069;
+static const int cospi_5_64  = 15893;
+static const int cospi_6_64  = 15679;
+static const int cospi_7_64  = 15426;
+static const int cospi_8_64  = 15137;
+static const int cospi_9_64  = 14811;
+static const int cospi_10_64 = 14449;
+static const int cospi_11_64 = 14053;
+static const int cospi_12_64 = 13623;
+static const int cospi_13_64 = 13160;
+static const int cospi_14_64 = 12665;
+static const int cospi_15_64 = 12140;
+static const int cospi_16_64 = 11585;
+static const int cospi_17_64 = 11003;
+static const int cospi_18_64 = 10394;
+static const int cospi_19_64 = 9760;
+static const int cospi_20_64 = 9102;
+static const int cospi_21_64 = 8423;
+static const int cospi_22_64 = 7723;
+static const int cospi_23_64 = 7005;
+static const int cospi_24_64 = 6270;
+static const int cospi_25_64 = 5520;
+static const int cospi_26_64 = 4756;
+static const int cospi_27_64 = 3981;
+static const int cospi_28_64 = 3196;
+static const int cospi_29_64 = 2404;
+static const int cospi_30_64 = 1606;
+static const int cospi_31_64 = 804;
+
+static inline int dct_const_round_shift(int input) {  // rounding right shift
+  const int out = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+  assert((out >= INT16_MIN) && (out <= INT16_MAX));  // must fit in int16_t
+  return out;
+}
 
-void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3]) << 5);
-    b1 = ((ip[1] + ip[2]) << 5);
-    c1 = ((ip[1] - ip[2]) << 5);
-    d1 = ((ip[0] - ip[3]) << 5);
-
-    op[0] = a1 + b1;
-    op[2] = a1 - b1;
-
-    op[1] = (c1 * 2217 + d1 * 5352 +  14500) >> 12;
-    op[3] = (d1 * 2217 - c1 * 5352 +   7500) >> 12;
-
-    ip += pitch / 2;
-    op += 4;
+static void fdct4_1d(int16_t *input, int16_t *output) {
+  // 4-point forward DCT; products are rounded by dct_const_round_shift().
+  const int16_t sum03 = input[0] + input[3];
+  const int16_t sum12 = input[1] + input[2];
+  const int16_t diff12 = input[1] - input[2];
+  const int16_t diff03 = input[0] - input[3];
+  int even0, even2, odd1, odd3;
+  // Even outputs: sum and difference of the two sums, times cospi_16_64.
+  even0 = (sum03 + sum12) * cospi_16_64;
+  even2 = (sum03 - sum12) * cospi_16_64;
+  output[0] = dct_const_round_shift(even0);
+  output[2] = dct_const_round_shift(even2);
+  // Odd outputs: the two differences combined with cospi_8_64/cospi_24_64.
+  odd1 = diff12 * cospi_24_64 + diff03 * cospi_8_64;
+  odd3 = -diff12 * cospi_8_64 + diff03 * cospi_24_64;
+  output[1] = dct_const_round_shift(odd1);
+  output[3] = dct_const_round_shift(odd3);
+}
 
+void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
+  // 4x4 forward DCT: fdct4_1d() on each column, then on each row.
+  int16_t out[4 * 4], temp_in[4], temp_out[4];
+  const int short_pitch = pitch >> 1;  // input row step in int16_t elements
+  int i, j;
+  // First transform cols
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = input[j * short_pitch + i] * 16;  // not << 4: UB if < 0
+    if (i == 0 && temp_in[0])
+      temp_in[0] += 1;  // NOTE(review): +1 bias when nonzero; confirm intent
+    fdct4_1d(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      out[j * 4 + i] = temp_out[j];
   }
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
-
-    op[0]  = (a1 + b1 + 7) >> 4;
-    op[8]  = (a1 - b1 + 7) >> 4;
-
-    op[4]  = ((c1 * 2217 + d1 * 5352 +  12000) >> 16) + (d1 != 0);
-    op[12] = (d1 * 2217 - c1 * 5352 +  51000) >> 16;
-
-    ip++;
-    op++;
+  // Then transform rows, rounding away 2 bits on output
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j + i * 4];
+    fdct4_1d(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      output[j + i * 4] = (temp_out[j] + 1) >> 2;
   }
 }
 
+
 void vp9_short_fdct8x4_c(short *input, short *output, int pitch)
 {
     vp9_short_fdct4x4_c(input,   output,    pitch);
-- 
2.40.0