From 8a9e4e8a618016b45eba148f167def81d80db103 Mon Sep 17 00:00:00 2001
From: Yunqing Wang <yunqingwang@google.com>
Date: Thu, 18 Oct 2012 16:31:59 -0700
Subject: [PATCH] Convert the transforms to integer forms.

Converted the forward and inverse transforms to integer forms.

Set "#define TEST_INT" to 1 or 0 in vp8/common/idctllm.c and vp8/encoder/dct.c
to build the integer or the float version of the transforms, respectively.

The tests showed that average OVERALL PSNR loss was less than 0.1%.

Change-Id: I1dfa4eeab6412597e3b970ce299cf0e116a917e6
---
 vp8/common/idct.h         |   2 +-
 vp8/common/idctllm.c      | 239 +++++++++++++++++++++++++++++++---
 vp8/encoder/dct.c         | 267 ++++++++++++++++++++++++++++++++++----
 vp8/encoder/encodeintra.c |   6 +-
 vp8/encoder/rdopt.c       |   6 +-
 5 files changed, 472 insertions(+), 48 deletions(-)

diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index ae33df668..0496ca2c7 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -110,7 +110,7 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_lossless_c);
 #endif
 
 #include "vp8/common/blockd.h"
-void vp8_ihtllm_c(short *input, short *output, int pitch,
+void vp8_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
                   TX_TYPE tx_type, int tx_dim);
 
 typedef prototype_idct((*vp8_idct_fn_t));
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index c7369b2e2..a0313bd2a 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -28,6 +28,7 @@
 
 #include "vp8/common/blockd.h"
 
+#include <assert.h>
 #include <math.h>
 
 static const int cospi8sqrt2minus1 = 20091;
@@ -88,6 +89,58 @@ float iadst_8[64] = {
   0.326790388032145,  -0.255357107325375,   0.175227946595736,  -0.089131608307532
 };
 
+const int16_t idct_i4[16] = {  /* 4-pt inverse DCT; presumably float * 2^14 */
+  8192,  10703,  8192,   4433,
+  8192,   4433, -8192, -10703,
+  8192,  -4433, -8192,  10703,
+  8192, -10703,  8192,  -4433
+};
+
+const int16_t iadst_i4[16] = {  /* 4-pt inverse ADST; presumably float * 2^14 */
+   3736,  9459, 10757,   7021,
+   7021,  9459, -3736, -10757,
+   9459,     0, -9459,   9459,
+  10757, -9459,  7021,  -3736
+};
+
+const int16_t idct_i8[64] = {  /* 8-pt inverse DCT; presumably float * 2^14 */
+   5793,  8035,  7568,  6811,
+   5793,  4551,  3135,  1598,
+   5793,  6811,  3135, -1598,
+  -5793, -8035, -7568, -4551,
+   5793,  4551, -3135, -8035,
+  -5793,  1598,  7568,  6811,
+   5793,  1598, -7568, -4551,
+   5793,  6811, -3135, -8035,
+   5793, -1598, -7568,  4551,
+   5793, -6811, -3135,  8035,
+   5793, -4551, -3135,  8035,
+  -5793, -1598,  7568, -6811,
+   5793, -6811,  3135,  1598,
+  -5793,  8035, -7568,  4551,
+   5793, -8035,  7568, -6811,
+   5793, -4551,  3135, -1598
+};
+
+const int16_t iadst_i8[64] = {  /* 8-pt inverse ADST; appears to be iadst_8 * 2^14 */
+   1460,  4184,  6342,  7644,
+   7914,  7114,  5354,  2871,
+   2871,  7114,  7644,  4184,
+  -1460, -6342, -7914, -5354,
+   4184,  7914,  2871, -5354,
+  -7644, -1460,  6342,  7114,
+   5354,  6342, -4184, -7114,
+   2871,  7644, -1460, -7914,
+   6342,  2871, -7914,  1460,
+   7114, -5354, -4184,  7644,
+   7114, -1460, -5354,  7914,
+  -4184, -2871,  7644, -6342,
+   7644, -5354,  1460,  2871,
+  -6342,  7914, -7114,  4184,
+   7914, -7644,  7114, -6342,
+   5354, -4184,  2871, -1460
+};
+
 float idct_16[256] = {
   0.250000,  0.351851,  0.346760,  0.338330,  0.326641,  0.311806,  0.293969,  0.273300,
   0.250000,  0.224292,  0.196424,  0.166664,  0.135299,  0.102631,  0.068975,  0.034654,
@@ -158,22 +211,94 @@ float iadst_16[256] = {
   0.240255, -0.215215,  0.188227, -0.159534,  0.129396, -0.098087,  0.065889, -0.033094
 };
 
-void vp8_ihtllm_c(short *input, short *output, int pitch,
-                  TX_TYPE tx_type, int tx_dim) {
+const int16_t idct_i16[256] = {  /* 16-pt inverse DCT; appears to be idct_16 * 2^14 */
+   4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,
+   4096,  3675,  3218,  2731,  2217,  1682,  1130,   568,
+   4096,  5543,  4816,  3675,  2217,   568, -1130, -2731,
+  -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
+   4096,  5109,  3218,   568, -2217, -4478, -5681, -5543,
+  -4096, -1682,  1130,  3675,  5352,  5765,  4816,  2731,
+   4096,  4478,  1130, -2731, -5352, -5543, -3218,   568,
+   4096,  5765,  4816,  1682, -2217, -5109, -5681, -3675,
+   4096,  3675, -1130, -5109, -5352, -1682,  3218,  5765,
+   4096,  -568, -4816, -5543, -2217,  2731,  5681,  4478,
+   4096,  2731, -3218, -5765, -2217,  3675,  5681,  1682,
+  -4096, -5543, -1130,  4478,  5352,   568, -4816, -5109,
+   4096,  1682, -4816, -4478,  2217,  5765,  1130, -5109,
+  -4096,  2731,  5681,   568, -5352, -3675,  3218,  5543,
+   4096,   568, -5681, -1682,  5352,  2731, -4816, -3675,
+   4096,  4478, -3218, -5109,  2217,  5543, -1130, -5765,
+   4096,  -568, -5681,  1682,  5352, -2731, -4816,  3675,
+   4096, -4478, -3218,  5109,  2217, -5543, -1130,  5765,
+   4096, -1682, -4816,  4478,  2217, -5765,  1130,  5109,
+  -4096, -2731,  5681,  -568, -5352,  3675,  3218, -5543,
+   4096, -2731, -3218,  5765, -2217, -3675,  5681, -1682,
+  -4096,  5543, -1130, -4478,  5352,  -568, -4816,  5109,
+   4096, -3675, -1130,  5109, -5352,  1682,  3218, -5765,
+   4096,   568, -4816,  5543, -2217, -2731,  5681, -4478,
+   4096, -4478,  1130,  2731, -5352,  5543, -3218,  -568,
+   4096, -5765,  4816, -1682, -2217,  5109, -5681,  3675,
+   4096, -5109,  3218,  -568, -2217,  4478, -5681,  5543,
+  -4096,  1682,  1130, -3675,  5352, -5765,  4816, -2731,
+   4096, -5543,  4816, -3675,  2217,  -568, -1130,  2731,
+  -4096,  5109, -5681,  5765, -5352,  4478, -3218,  1682,
+   4096, -5765,  5681, -5543,  5352, -5109,  4816, -4478,
+   4096, -3675,  3218, -2731,  2217, -1682,  1130,  -568
+};
 
-  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+const int16_t iadst_i16[256] = {  /* 16-pt inverse ADST; appears to be iadst_16 * 2^14 */
+    542,  1607,  2614,  3526,  4311,  4940,  5390,  5646,
+   5698,  5543,  5189,  4646,  3936,  3084,  2120,  1080,
+   1080,  3084,  4646,  5543,  5646,  4940,  3526,  1607,
+   -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,
+   1607,  4311,  5646,  5189,  3084,     0, -3084, -5189,
+  -5646, -4311, -1607,  1607,  4311,  5646,  5189,  3084,
+   2120,  5189,  5390,  2614, -1607, -4940, -5543, -3084,
+   1080,  4646,  5646,  3526, -542,  -4311, -5698, -3936,
+   2614,  5646,  3936, -1080, -5189, -4940,  -542,  4311,
+   5543,  2120, -3084, -5698, -3526,  1607,  5390,  4646,
+   3084,  5646,  1607, -4311, -5189,     0,  5189,  4311,
+  -1607, -5646, -3084,  3084,  5646,  1607, -4311, -5189,
+   3526,  5189, -1080, -5698, -1607,  4940,  3936, -3084,
+  -5390,   542,  5646,  2120, -4646, -4311,  2614,  5543,
+   3936,  4311, -3526, -4646,  3084,  4940, -2614, -5189,
+   2120,  5390, -1607, -5543,  1080,  5646,  -542, -5698,
+   4311,  3084, -5189, -1607,  5646,     0, -5646,  1607,
+   5189, -3084, -4311,  4311,  3084, -5189, -1607,  5646,
+   4646,  1607, -5698,  2120,  4311, -4940, -1080,  5646,
+  -2614, -3936,  5189,   542, -5543,  3084,  3526, -5390,
+   4940,     0, -4940,  4940,     0, -4940,  4940,     0,
+  -4940,  4940,     0, -4940,  4940,     0, -4940,  4940,
+   5189, -1607, -3084,  5646, -4311,     0,  4311, -5646,
+   3084,  1607, -5189,  5189, -1607, -3084,  5646, -4311,
+   5390, -3084,  -542,  3936, -5646,  4940, -2120, -1607,
+   4646, -5698,  4311, -1080, -2614,  5189, -5543,  3526,
+   5543, -4311,  2120,   542, -3084,  4940, -5698,  5189,
+  -3526,  1080,  1607, -3936,  5390, -5646,  4646, -2614,
+   5646, -5189,  4311, -3084,  1607,     0, -1607,  3084,
+  -4311,  5189, -5646,  5646, -5189,  4311, -3084,  1607,
+   5698, -5646,  5543, -5390,  5189, -4940,  4646, -4311,
+   3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542
+};
+
+/* Test switch: the implementation selected here is compiled as vp8_ihtllm_c. */
+#define TEST_INT 1  /* 1 = integer transform, 0 = float transform */
+#if TEST_INT
+#define vp8_ihtllm_int_c vp8_ihtllm_c
+#else
+#define vp8_ihtllm_float_c vp8_ihtllm_c
+#endif
+
+void vp8_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
+                  TX_TYPE tx_type, int tx_dim) {
+  vp8_clear_system_state();  // Make it simd safe : __asm emms;
   {
     int i, j, k;
-    float bufa[256], bufb[256]; // buffers are for floating-point test purpose
-                                // the implementation could be simplified in
-                                // conjunction with integer transform
-
-                                // further notice, since we are thinking to use
-                                // one function for both 4x4 and 8x8 transforms
-                                // the temporary buffers are simply initialized
-                                // with 64.
-    short *ip = input;
-    short *op = output;
+    float bufa[256], bufb[256];  // buffers are for floating-point test purpose
+                                 // the implementation could be simplified in
+                                 // conjunction with integer transform
+    const int16_t *ip = input;
+    int16_t *op = output;
     int shortpitch = pitch >> 1;
 
     float *pfa = &bufa[0];
@@ -272,8 +397,8 @@ void vp8_ihtllm_c(short *input, short *output, int pitch,
 
     for(j = 0; j < tx_dim; j++) {
       for(i = 0; i < tx_dim; i++) {
-        op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
-                               -(short)( - pfa[i] / 8 + 0.49);
+        op[i] = (pfa[i] > 0 ) ? (int16_t)( pfa[i] / 8 + 0.49) :
+                               -(int16_t)( - pfa[i] / 8 + 0.49);
       }
 
       op  += shortpitch;
@@ -283,6 +408,90 @@ void vp8_ihtllm_c(short *input, short *output, int pitch,
   vp8_clear_system_state(); // Make it simd safe : __asm emms;
 }
 
+/* Integer inverse hybrid transform (ADST/DCT mix) for 4x4, 8x8 and 16x16.
+ * The idct_iN / iadst_iN tables appear to be the float kernels scaled by 2^14
+ * (e.g. 0.25 -> 4096), so the two passes shift by 14 + 17 = 2 * 14 + 3 bits
+ * in total; the extra 3 bits match the divide-by-8 in vp8_ihtllm_float_c. */
+#define VERTICAL_SHIFT 14
+#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
+#define HORIZONTAL_SHIFT 17
+#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
+void vp8_ihtllm_int_c(const int16_t *input, int16_t *output, int pitch,
+                   TX_TYPE tx_type, int tx_dim) {
+  /* Kernel tables for this block size; any tx_dim other than 4 or 8 selects
+   * the 16-point tables. */
+  const int16_t *const adst = (tx_dim == 4) ? iadst_i4
+                            : (tx_dim == 8) ? iadst_i8 : iadst_i16;
+  const int16_t *const dct = (tx_dim == 4) ? idct_i4
+                           : (tx_dim == 8) ? idct_i8 : idct_i16;
+  const int16_t *ptv = NULL, *pth = NULL;  /* vertical / horizontal kernels */
+  const int shortpitch = pitch >> 1;  /* output row stride in int16_t units */
+  int16_t imbuf[256];  /* first-pass result, tx_dim x tx_dim, row-major */
+  int16_t *im = imbuf;
+  int16_t *op = output;
+  int i, j, k;
+
+  switch (tx_type) {
+    case ADST_ADST:
+      ptv = pth = adst;
+      break;
+    case ADST_DCT:
+      ptv = adst;
+      pth = dct;
+      break;
+    case DCT_ADST:
+      ptv = dct;
+      pth = adst;
+      break;
+    case DCT_DCT:
+      ptv = pth = dct;
+      break;
+    default:
+      /* assert() is compiled out under NDEBUG; return instead of walking
+       * NULL kernel pointers below. */
+      assert(0);
+      return;
+  }
+
+  /* Vertical pass: im[j][i] = sum_k ptv[j][k] * input[k][i].  The input
+   * block is read with a row stride of tx_dim, not pitch. */
+  for (j = 0; j < tx_dim; j++) {
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += ptv[k] * input[k * tx_dim + i];
+      }
+
+      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
+    }
+    im += tx_dim;
+    ptv += tx_dim;
+  }
+
+  /* Horizontal pass: output[j][i] = sum_k im[j][k] * pth[i][k], with output
+   * rows shortpitch elements apart. */
+  im = imbuf;
+
+  for (j = 0; j < tx_dim; j++) {
+    const int16_t *pthc = pth;
+
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += im[k] * pthc[k];
+      }
+
+      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
+      pthc += tx_dim;
+    }
+
+    im += tx_dim;
+    op += shortpitch;
+  }
+}
+
 void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index 0983b1c0a..edb667425 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -9,6 +9,7 @@
  */
 
 
+#include <assert.h>
 #include <math.h>
 #include "vpx_ports/config.h"
 #include "vp8/common/idct.h"
@@ -70,6 +71,59 @@ float adst_8[64] = {
   0.466553967085785,  -0.387095214016348,   0.255357107325376,  -0.089131608307532
 };
 
+/* Integer forward kernels; values appear to be the float ones * 2^15. */
+const int16_t dct_i4[16] = {  /* 4-pt forward DCT */
+  16384,  16384,  16384,  16384,
+  21407,   8867,  -8867, -21407,
+  16384, -16384, -16384,  16384,
+   8867, -21407,  21407,  -8867
+};
+
+const int16_t adst_i4[16] = {  /* 4-pt forward ADST; presumably float * 2^15 */
+   7472,  14042,  18919,  21513,
+  18919,  18919,      0, -18919,
+  21513,  -7472, -18919,  14042,
+  14042, -21513,  18919,  -7472
+};
+
+const int16_t dct_i8[64] = {  /* 8-pt forward DCT; presumably float * 2^15 */
+   11585,  11585,  11585,  11585,
+   11585,  11585,  11585,  11585,
+   16069,  13623,   9102,   3196,
+   -3196,  -9102, -13623, -16069,
+   15137,   6270,  -6270, -15137,
+  -15137,  -6270,   6270,  15137,
+   13623,  -3196, -16069,  -9102,
+    9102,  16069,   3196, -13623,
+   11585, -11585, -11585,  11585,
+   11585, -11585, -11585,  11585,
+    9102, -16069,   3196,  13623,
+  -13623,  -3196,  16069,  -9102,
+    6270, -15137,  15137,  -6270,
+   -6270,  15137, -15137,   6270,
+    3196,  -9102,  13623, -16069,
+   16069, -13623,   9102,  -3196
+};
+
+const int16_t adst_i8[64] = {  /* 8-pt forward ADST; appears to be adst_8 * 2^15 */
+    2921,   5742,   8368,  10708,
+   12684,  14228,  15288,  15827,
+    8368,  14228,  15827,  12684,
+    5742,  -2921, -10708, -15288,
+   12684,  15288,   5742,  -8368,
+  -15827, -10708,   2921,  14228,
+   15288,   8368, -10708, -14228,
+    2921,  15827,   5742, -12684,
+   15827,  -2921, -15288,   5742,
+   14228,  -8368, -12684,  10708,
+   14228, -12684,  -2921,  15288,
+  -10708,  -5742,  15827,  -8368,
+   10708, -15827,  12684,  -2921,
+   -8368,  15288, -14228,   5742,
+    5742, -10708,  14228, -15827,
+   15288, -12684,   8368,  -2921
+};
+
 float dct_16[256] = {
   0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
   0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
@@ -140,6 +194,77 @@ float adst_16[256] = {
   0.344612, -0.329007,  0.301511, -0.263118,  0.215215, -0.159534,  0.098087, -0.033094
 };
 
+/* Integer 16-point forward kernels; appear to be dct_16/adst_16 * 2^15. */
+const int16_t dct_i16[256] = {  /* 16-pt forward DCT */
+    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
+    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
+   11529,  11086,  10217,   8955,   7350,   5461,   3363,   1136,
+   -1136,  -3363,  -5461,  -7350,  -8955, -10217, -11086, -11529,
+   11363,   9633,   6436,   2260,  -2260,  -6436,  -9633, -11363,
+  -11363,  -9633,  -6436,  -2260,   2260,   6436,   9633,  11363,
+   11086,   7350,   1136,  -5461, -10217, -11529,  -8955,  -3363,
+    3363,   8955,  11529,  10217,   5461,  -1136,  -7350, -11086,
+   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
+   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
+   10217,   1136,  -8955, -11086,  -3363,   7350,  11529,   5461,
+   -5461, -11529,  -7350,   3363,  11086,   8955,  -1136, -10217,
+    9633,  -2260, -11363,  -6436,   6436,  11363,   2260,  -9633,
+   -9633,   2260,  11363,   6436,  -6436, -11363,  -2260,   9633,
+    8955,  -5461, -11086,   1136,  11529,   3363, -10217,  -7350,
+    7350,  10217,  -3363, -11529,  -1136,  11086,   5461,  -8955,
+    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
+    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
+    7350, -10217,  -3363,  11529,  -1136, -11086,   5461,   8955,
+   -8955,  -5461,  11086,   1136, -11529,   3363,  10217,  -7350,
+    6436, -11363,   2260,   9633,  -9633,  -2260,  11363,  -6436,
+   -6436,  11363,  -2260,  -9633,   9633,   2260, -11363,   6436,
+    5461, -11529,   7350,   3363, -11086,   8955,   1136, -10217,
+   10217,  -1136,  -8955,  11086,  -3363,  -7350,  11529,  -5461,
+    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
+    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
+    3363,  -8955,  11529, -10217,   5461,   1136,  -7350,  11086,
+  -11086,   7350,  -1136,  -5461,  10217, -11529,   8955,  -3363,
+    2260,  -6436,   9633, -11363,  11363,  -9633,   6436,  -2260,
+   -2260,   6436,  -9633,  11363, -11363,   9633,  -6436,   2260,
+    1136,  -3363,   5461,  -7350,   8955, -10217,  11086, -11529,
+   11529, -11086,  10217,  -8955,   7350,  -5461,   3363,  -1136
+};
+
+const int16_t adst_i16[256] = {  /* 16-pt forward ADST; appears to be adst_16 * 2^15 */
+    1084,   2159,   3214,   4240,   5228,   6168,   7052,   7873,
+    8622,   9293,   9880,  10377,  10781,  11087,  11292,  11395,
+    3214,   6168,   8622,  10377,  11292,  11292,  10377,   8622,
+    6168,   3214,      0,  -3214,  -6168,  -8622, -10377, -11292,
+    5228,   9293,  11292,  10781,   7873,   3214,  -2159,  -7052,
+  -10377, -11395,  -9880,  -6168,  -1084,   4240,   8622,  11087,
+    7052,  11087,  10377,   5228,  -2159,  -8622, -11395,  -9293,
+   -3214,   4240,   9880,  11292,   7873,   1084,  -6168, -10781,
+    8622,  11292,   6168,  -3214, -10377, -10377,  -3214,   6168,
+   11292,   8622,      0,  -8622, -11292,  -6168,   3214,  10377,
+    9880,   9880,      0,  -9880,  -9880,      0,   9880,   9880,
+       0,  -9880,  -9880,      0,   9880,   9880,      0,  -9880,
+   10781,   7052,  -6168, -11087,  -1084,  10377,   7873,  -5228,
+  -11292,  -2159,   9880,   8622,  -4240, -11395,  -3214,   9293,
+   11292,   3214, -10377,  -6168,   8622,   8622,  -6168, -10377,
+    3214,  11292,      0, -11292,  -3214,  10377,   6168,  -8622,
+   11395,  -1084, -11292,   2159,  11087,  -3214, -10781,   4240,
+   10377,  -5228,  -9880,   6168,   9293,  -7052,  -8622,   7873,
+   11087,  -5228,  -8622,   9293,   4240, -11292,   1084,  10781,
+   -6168,  -7873,   9880,   3214, -11395,   2159,  10377,  -7052,
+   10377,  -8622,  -3214,  11292,  -6168,  -6168,  11292,  -3214,
+   -8622,  10377,      0, -10377,   8622,   3214, -11292,   6168,
+    9293, -10781,   3214,   7052, -11395,   6168,   4240, -11087,
+    8622,   1084,  -9880,  10377,  -2159,  -7873,  11292,  -5228,
+    7873, -11395,   8622,  -1084,  -7052,  11292,  -9293,   2159,
+    6168, -11087,   9880,  -3214,  -5228,  10781, -10377,   4240,
+    6168, -10377,  11292,  -8622,   3214,   3214,  -8622,  11292,
+  -10377,   6168,      0,  -6168,  10377, -11292,   8622,  -3214,
+    4240,  -7873,  10377, -11395,  10781,  -8622,   5228,  -1084,
+   -3214,   7052,  -9880,  11292, -11087,   9293,  -6168,   2159,
+    2159,  -4240,   6168,  -7873,   9293, -10377,  11087, -11395,
+   11292, -10781,   9880,  -8622,   7052,  -5228,   3214,  -1084
+};
+
 static const int xC1S7 = 16069;
 static const int xC2S6 = 15137;
 static const int xC3S5 = 13623;
@@ -395,17 +520,24 @@ void vp8_short_fhaar2x2_c(short *input, short *output, int pitch) { // pitch = 8
 
 }
 
-void vp8_fht_c(short *input, short *output, int pitch,
-               TX_TYPE tx_type, int tx_dim) {
+/* Test switch: the implementation selected here is compiled as vp8_fht_c. */
+#define TEST_INT 1  /* 1 = integer transform, 0 = float transform */
+#if TEST_INT
+#define vp8_fht_int_c vp8_fht_c
+#else
+#define vp8_fht_float_c vp8_fht_c
+#endif
 
-  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+void vp8_fht_float_c(const int16_t *input, int pitch, int16_t *output,
+               TX_TYPE tx_type, int tx_dim) {
+  vp8_clear_system_state();  // Make it simd safe : __asm emms;
   {
     int i, j, k;
-    float bufa[256], bufb[256]; // buffers are for floating-point test purpose
-                               // the implementation could be simplified in
-                               // conjunction with integer transform
-    short *ip = input;
-    short *op = output;
+    float bufa[256], bufb[256];  // buffers are for floating-point test purpose
+                                 // the implementation could be simplified in
+                                 // conjunction with integer transform
+    const int16_t *ip = input;
+    int16_t *op = output;
 
     float *pfa = &bufa[0];
     float *pfb = &bufb[0];
@@ -415,8 +547,8 @@ void vp8_fht_c(short *input, short *output, int pitch,
 
     assert(tx_type != DCT_DCT);
     // load and convert residual array into floating-point
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
         pfa[i] = (float)ip[i];
       }
       pfa += tx_dim;
@@ -427,7 +559,7 @@ void vp8_fht_c(short *input, short *output, int pitch,
     pfa = &bufa[0];
     pfb = &bufb[0];
 
-    switch(tx_type) {
+    switch (tx_type) {
       case ADST_ADST :
       case ADST_DCT  :
         ptv = (tx_dim == 4) ? &adst_4[0] :
@@ -440,10 +572,10 @@ void vp8_fht_c(short *input, short *output, int pitch,
         break;
     }
 
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
         pfb[i] = 0;
-        for(k = 0; k < tx_dim; k++) {
+        for (k = 0; k < tx_dim; k++) {
           pfb[i] += ptv[k] * pfa[(k * tx_dim)];
         }
         pfa += 1;
@@ -457,7 +589,7 @@ void vp8_fht_c(short *input, short *output, int pitch,
     pfa = &bufa[0];
     pfb = &bufb[0];
 
-    switch(tx_type) {
+    switch (tx_type) {
       case ADST_ADST :
       case  DCT_ADST :
         pth = (tx_dim == 4) ? &adst_4[0] :
@@ -470,10 +602,10 @@ void vp8_fht_c(short *input, short *output, int pitch,
         break;
     }
 
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
         pfa[i] = 0;
-        for(k = 0; k < tx_dim; k++) {
+        for (k = 0; k < tx_dim; k++) {
           pfa[i] += pfb[k] * pth[k];
         }
         pth += tx_dim;
@@ -483,7 +615,7 @@ void vp8_fht_c(short *input, short *output, int pitch,
       pfb += tx_dim;
       // pth -= tx_dim * tx_dim;
 
-      switch(tx_type) {
+      switch (tx_type) {
         case ADST_ADST :
         case  DCT_ADST :
           pth = (tx_dim == 4) ? &adst_4[0] :
@@ -498,19 +630,102 @@ void vp8_fht_c(short *input, short *output, int pitch,
     }
 
     // convert to short integer format and load BLOCKD buffer
-    op  = output ;
-    pfa = &bufa[0] ;
+    op = output;
+    pfa = &bufa[0];
 
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
-                                     -(short)(- 8 * pfa[i] + 0.49);
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        op[i] = (pfa[i] > 0 ) ? (int16_t)( 8 * pfa[i] + 0.49) :
+                                     -(int16_t)(- 8 * pfa[i] + 0.49);
       }
       op  += tx_dim;
       pfa += tx_dim;
     }
   }
-  vp8_clear_system_state(); // Make it simd safe : __asm emms;
+  vp8_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+/* Integer forward hybrid transform (ADST/DCT mix) for 4x4, 8x8 and 16x16.
+ * The dct_iN / adst_iN tables appear to be the float kernels scaled by 2^15
+ * (e.g. 0.25 -> 8192), so the two passes shift by 11 + 16 = 2 * 15 - 3 bits
+ * in total; the missing 3 bits match the multiply-by-8 in vp8_fht_float_c. */
+#define VERTICAL_SHIFT 11
+#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
+#define HORIZONTAL_SHIFT 16
+#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
+void vp8_fht_int_c(const int16_t *input, int pitch, int16_t *output,
+                   TX_TYPE tx_type, int tx_dim) {
+  /* Kernels for this size; a tx_dim other than 4 or 8 selects 16-point. */
+  const int16_t *const adst = (tx_dim == 4) ? adst_i4
+                            : (tx_dim == 8) ? adst_i8 : adst_i16;
+  const int16_t *const dct = (tx_dim == 4) ? dct_i4
+                           : (tx_dim == 8) ? dct_i8 : dct_i16;
+  const int16_t *ptv = NULL, *pth = NULL;  /* vertical / horizontal kernels */
+  const int in_stride = pitch >> 1;  /* input row stride in int16_t units */
+  int16_t imbuf[256];  /* first-pass result, tx_dim x tx_dim, row-major */
+  int16_t *im = imbuf;
+  int16_t *op = output;
+  int i, j, k;
+
+  switch (tx_type) {
+    case ADST_ADST:
+      ptv = pth = adst;
+      break;
+    case ADST_DCT:
+      ptv = adst;
+      pth = dct;
+      break;
+    case DCT_ADST:
+      ptv = dct;
+      pth = adst;
+      break;
+    case DCT_DCT:
+      ptv = pth = dct;
+      break;
+    default:
+      /* assert() is compiled out under NDEBUG; return instead of walking
+       * NULL kernel pointers below. */
+      assert(0);
+      return;
+  }
+
+  /* Vertical pass: im[j][i] = sum_k ptv[j][k] * input[k][i], reading input
+   * rows in_stride elements apart. */
+  for (j = 0; j < tx_dim; j++) {
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += ptv[k] * input[k * in_stride + i];
+      }
+
+      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
+    }
+    im += tx_dim;
+    ptv += tx_dim;
+  }
+
+  /* Horizontal pass: output[j][i] = sum_k im[j][k] * pth[i][k]; the output
+   * block is written densely, tx_dim elements per row. */
+  im = imbuf;
+
+  for (j = 0; j < tx_dim; j++) {
+    const int16_t *pthc = pth;
+
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += im[k] * pthc[k];
+      }
+
+      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
+      pthc += tx_dim;
+    }
+
+    im += tx_dim;
+    op += tx_dim;
+  }
 }
 
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch) {
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 508e5aaa5..d23c2305b 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -73,7 +73,7 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
 
   tx_type = get_tx_type(&x->e_mbd, b);
   if (tx_type != DCT_DCT) {
-    vp8_fht_c(be->src_diff, be->coeff, 32, tx_type, 4);
+    vp8_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
     vp8_ht_quantize_b_4x4(be, b, tx_type);
     vp8_ihtllm_c(b->dqcoeff, b->diff, 32, tx_type, 4);
   } else {
@@ -114,7 +114,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
     BLOCKD  *bd = &xd->block[0];
     tx_type = get_tx_type(xd, bd);
     if (tx_type != DCT_DCT) {
-      vp8_fht_c(b->src_diff, b->coeff, 32, tx_type, 16);
+      vp8_fht_c(b->src_diff, 32, b->coeff, tx_type, 16);
       vp8_quantize_mby_16x16(x);
       if (x->optimize)
         vp8_optimize_mby_16x16(x, rtcd);
@@ -205,7 +205,7 @@ void vp8_encode_intra8x8(const VP8_ENCODER_RTCD *rtcd,
 
     tx_type = get_tx_type(xd, xd->block + idx);
     if (tx_type != DCT_DCT) {
-      vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32,
+      vp8_fht_c(be->src_diff, 32, (x->block + idx)->coeff,
                 tx_type, 8);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
       vp8_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 9ead1bd36..8207574dd 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -834,7 +834,7 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
 
   tx_type = get_tx_type_16x16(xd, b);
   if (tx_type != DCT_DCT) {
-    vp8_fht_c(be->src_diff, be->coeff, 32, tx_type, 16);
+    vp8_fht_c(be->src_diff, 32, be->coeff, tx_type, 16);
   } else
     vp8_transform_mby_16x16(mb);
 
@@ -1116,7 +1116,7 @@ static int64_t rd_pick_intra4x4block(VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be,
       b->bmi.as_mode.first = mode;
       tx_type = get_tx_type_4x4(xd, b);
       if (tx_type != DCT_DCT) {
-        vp8_fht_c(be->src_diff, be->coeff, 32, tx_type, 4);
+        vp8_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
         vp8_ht_quantize_b_4x4(be, b, tx_type);
       } else {
         x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
@@ -1441,7 +1441,7 @@ static int64_t rd_pick_intra8x8block(VP8_COMP *cpi, MACROBLOCK *x, int ib,
       if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
         TX_TYPE tx_type = get_tx_type_8x8(xd, b);
         if (tx_type != DCT_DCT)
-          vp8_fht_c(be->src_diff, (x->block + idx)->coeff, 32, tx_type, 8);
+          vp8_fht_c(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
         else
           x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
         x->quantize_b_8x8(x->block + idx, xd->block + idx);
-- 
2.50.1