From 74854987735599e2c42359d4ee0a59d4b78ebb47 Mon Sep 17 00:00:00 2001
From: Debargha Mukherjee
Date: Fri, 12 Feb 2016 16:44:33 -0800
Subject: [PATCH] Extends ext-tx to support 32x32 masked transforms

Adds new 32x32 masked 1-d transforms that combine 1-D length-16
DCT with length-16 identity transforms. To be continued in
subsequent patches.

Change-Id: I0b4f66492d44c079b3c3b531ba48a97201de1484
---
 vp10/common/idct.c            | 236 ++++++++++++++++++++++++++++++++--
 vp10/common/vp10_rtcd_defs.pl |   9 ++
 vp10/encoder/dct.c            | 100 +++++++++++++-
 vpx_dsp/inv_txfm.c            |  12 +-
 vpx_dsp/inv_txfm.h            |   1 +
 vpx_dsp/txfm_common.h         |   5 +-
 6 files changed, 343 insertions(+), 20 deletions(-)

diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index 6f38f74a7..dbb50fbba 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -259,6 +259,73 @@ void idst16_c(const tran_low_t *input, tran_low_t *output) {
   output[15] = WRAPLOW(-step2[0] + step2[15], 8);
 }
 
+#if CONFIG_EXT_TX
+// For use in lieu of DST
+static void ihalfcenter32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 8; ++i) {
+    output[i] = input[16 + i] * 4;
+    output[24 + i] = input[24 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+  }
+  idct16_c(inputhalf, output + 8);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+  }
+  idct16_c(inputhalf, output + 16);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_ihalfcenter32_c(const tran_low_t *input, tran_low_t *output,
+                                   int bd) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 8; ++i) {
+    output[i] = input[16 + i] * 4;
+    output[24 + i] = input[24 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * Sqrt2, bd);
+  }
+  vpx_highbd_idct16_c(inputhalf, output + 8, bd);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+static void highbd_ihalfright32_c(const tran_low_t *input, tran_low_t *output,
+                                  int bd) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * Sqrt2, bd);
+  }
+  vpx_highbd_idct16_c(inputhalf, output + 16, bd);
+  // Note overall scaling factor is 4 times orthogonal
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_TX
+
 // Inverse identiy transform and add.
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int bs) {
@@ -808,6 +875,67 @@ void vp10_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
   }
 }
 
+#if CONFIG_EXT_TX
+void vp10_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+                              int stride, int tx_type) {
+  static const transform_2d IHT_32[] = {
+    { idct32_c, idct32_c },                // DCT_DCT           = 0,
+    { ihalfright32_c, idct32_c },          // ADST_DCT          = 1,
+    { idct32_c, ihalfright32_c },          // DCT_ADST          = 2,
+    { ihalfright32_c, ihalfright32_c },    // ADST_ADST         = 3,
+    { ihalfright32_c, idct32_c },          // FLIPADST_DCT      = 4,
+    { idct32_c, ihalfright32_c },          // DCT_FLIPADST      = 5,
+    { ihalfright32_c, ihalfright32_c },    // FLIPADST_FLIPADST = 6,
+    { ihalfright32_c, ihalfright32_c },    // ADST_FLIPADST     = 7,
+    { ihalfright32_c, ihalfright32_c },    // FLIPADST_ADST     = 8,
+    { ihalfcenter32_c, idct32_c },         // DST_DCT           = 9,
+    { idct32_c, ihalfcenter32_c },         // DCT_DST           = 10,
+    { ihalfcenter32_c, ihalfright32_c },   // DST_ADST          = 11,
+    { ihalfright32_c, ihalfcenter32_c },   // ADST_DST          = 12,
+    { ihalfcenter32_c, ihalfright32_c },   // DST_FLIPADST      = 13,
+    { ihalfright32_c, ihalfcenter32_c },   // FLIPADST_DST      = 14,
+    { ihalfcenter32_c, ihalfcenter32_c },  // DST_DST           = 15
+  };
+
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[32][32];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 32;
+
+  // inverse transform row vectors
+  for (i = 0; i < 32; ++i) {
+    IHT_32[tx_type].rows(input, out[i]);
+    input += 32;
+  }
+
+  // transpose
+  for (i = 1 ; i < 32; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 32; ++i) {
+    IHT_32[tx_type].cols(out[i], out[i]);
+  }
+
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32);
+
+  // Sum with the destination
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+    }
+  }
+}
+#endif  // CONFIG_EXT_TX
+
 // idct
 void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                       int eob) {
@@ -998,15 +1126,27 @@ void vp10_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
       vp10_idct32x32_add(input, dest, stride, eob);
       break;
 #if CONFIG_EXT_TX
-    case IDTX:
-      inv_idtx_add_c(input, dest, stride, 32);
-      break;
-#endif  // CONFIG_EXT_TX
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-      assert(0);
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
       break;
+    case IDTX:
+      inv_idtx_add_c(input, dest, stride, 32);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -1212,6 +1352,70 @@ void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
+#if CONFIG_EXT_TX
+void vp10_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int tx_type, int bd) {
+  static const highbd_transform_2d HIGH_IHT_32[] = {
+    { vpx_highbd_idct32_c, vpx_highbd_idct32_c },        // DCT_DCT
+    { highbd_ihalfright32_c, vpx_highbd_idct32_c },      // ADST_DCT
+    { vpx_highbd_idct32_c, highbd_ihalfright32_c },      // DCT_ADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },    // ADST_ADST
+    { highbd_ihalfright32_c, vpx_highbd_idct32_c },      // FLIPADST_DCT
+    { vpx_highbd_idct32_c, highbd_ihalfright32_c },      // DCT_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },    // FLIPADST_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },    // ADST_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },    // FLIPADST_ADST
+    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c },     // DST_DCT
+    { vpx_highbd_idct32_c, highbd_ihalfcenter32_c },     // DCT_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },   // DST_ADST
+    { highbd_ihalfright32_c, highbd_ihalfcenter32_c },   // ADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },   // DST_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfcenter32_c },   // FLIPADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c },  // DST_DST
+  };
+
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[32][32];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 32;
+
+  // inverse transform row vectors
+  for (i = 0; i < 32; ++i) {
+    HIGH_IHT_32[tx_type].rows(input, out[i], bd);
+    input += 32;
+  }
+
+  // transpose
+  for (i = 1 ; i < 32; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 32; ++i) {
+    HIGH_IHT_32[tx_type].cols(out[i], out[i], bd);
+  }
+
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32);
+
+  // Sum with the destination
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 6), bd);
+    }
+  }
+}
+#endif  // CONFIG_EXT_TX
+
 // idct
 void vp10_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                              int eob, int bd) {
@@ -1409,15 +1613,27 @@ void vp10_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
       vp10_highbd_idct32x32_add(input, dest, stride, eob, bd);
       break;
 #if CONFIG_EXT_TX
-    case IDTX:
-      highbd_inv_idtx_add_c(input, dest, stride, 32, bd);
-      break;
-#endif  // CONFIG_EXT_TX
     case ADST_DCT:
     case DCT_ADST:
    case ADST_ADST:
-      assert(0);
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
       break;
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 32, bd);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
      assert(0);
      break;
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 9860baedf..c9f02953f 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -404,6 +404,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht16x16 sse2/;
 
+  add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht32x32/;
+
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
 } else {
@@ -416,6 +419,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht16x16 sse2 msa/;
 
+  add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht32x32/;
+
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
 }
@@ -642,6 +648,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp10_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_highbd_fht16x16/;
 
+  add_proto qw/void vp10_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_highbd_fht32x32/;
+
   add_proto qw/void vp10_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_highbd_fwht4x4/;
 
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index cdb732a44..333adbbcb 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -14,7 +14,6 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
-
 #include "vp10/common/blockd.h"
 #include "vp10/common/idct.h"
 #include "vpx_dsp/fwd_txfm.h"
@@ -538,7 +537,7 @@ static void fdct16(const tran_low_t *input, tran_low_t *output) {
   range_check(output, 16, 16);
 }
 
-/* TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
+#if CONFIG_EXT_TX
 static void fdct32(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[32];
@@ -936,7 +935,7 @@ static void fdct32(const tran_low_t *input, tran_low_t *output) {
   range_check(output, 32, 18);
 }
-*/
+#endif  // CONFIG_EXT_TX
 
 static void fadst4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t x0, x1, x2, x3;
@@ -1213,6 +1212,37 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
 }
 
 #if CONFIG_EXT_TX
+// For use in lieu of DST
+static void fhalfcenter32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 8; ++i) {
+    output[16 + i] = input[i] * 4;
+    output[24 + i] = input[24 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 8] * Sqrt2);
+  }
+  fdct16(inputhalf, output);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+// For use in lieu of ADST
+static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[16 + i] = input[i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
+  }
+  fdct16(inputhalf, output);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
 static void copy_block(const int16_t *src, int src_stride, int l,
                        int16_t *dest, int dest_stride) {
   int i;
@@ -1375,6 +1405,27 @@ static const transform_2d FHT_16[] = {
 #endif  // CONFIG_EXT_TX
 };
 
+#if CONFIG_EXT_TX
+static const transform_2d FHT_32[] = {
+  { fdct32, fdct32 },                // DCT_DCT           = 0,
+  { fhalfright32, fdct32 },          // ADST_DCT          = 1,
+  { fdct32, fhalfright32 },          // DCT_ADST          = 2,
+  { fhalfright32, fhalfright32 },    // ADST_ADST         = 3,
+  { fhalfright32, fdct32 },          // FLIPADST_DCT      = 4,
+  { fdct32, fhalfright32 },          // DCT_FLIPADST      = 5,
+  { fhalfright32, fhalfright32 },    // FLIPADST_FLIPADST = 6,
+  { fhalfright32, fhalfright32 },    // ADST_FLIPADST     = 7,
+  { fhalfright32, fhalfright32 },    // FLIPADST_ADST     = 8,
+  { fhalfcenter32, fdct32 },         // DST_DCT           = 9,
+  { fdct32, fhalfcenter32 },         // DCT_DST           = 10,
+  { fhalfcenter32, fhalfright32 },   // DST_ADST          = 11,
+  { fhalfright32, fhalfcenter32 },   // ADST_DST          = 12,
+  { fhalfcenter32, fhalfright32 },   // DST_FLIPADST      = 13,
+  { fhalfright32, fhalfcenter32 },   // FLIPADST_DST      = 14,
+  { fhalfcenter32, fhalfcenter32 },  // DST_DST           = 15
+};
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
                    int tx_type) {
   if (tx_type == DCT_DCT) {
@@ -1671,3 +1722,46 @@ void vp10_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
   vp10_fht16x16_c(input, output, stride, tx_type);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EXT_TX
+void vp10_fht32x32_c(const int16_t *input, tran_low_t *output,
+                     int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vpx_fdct32x32_c(input, output, stride);
+  } else {
+    tran_low_t out[1024];
+    int i, j;
+    tran_low_t temp_in[32], temp_out[32];
+    const transform_2d ht = FHT_32[tx_type];
+
+    int16_t flipped_input[32 * 32];
+    maybe_flip_input(&input, &stride, 32, flipped_input, tx_type);
+
+    // Columns
+    for (i = 0; i < 32; ++i) {
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 32; ++j)
+        out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+    }
+
+    // Rows
+    for (i = 0; i < 32; ++i) {
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = out[j + i * 32];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 32; ++j)
+        output[j + i * 32] =
+            (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_fht32x32_c(const int16_t *input, tran_low_t *output,
+                            int stride, int tx_type) {
+  vp10_fht32x32_c(input, output, stride, tx_type);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_TX
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index a0f59bf75..402fd9a23 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -2057,8 +2057,8 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
-static void highbd_idct32_c(const tran_low_t *input,
-                            tran_low_t *output, int bd) {
+void vpx_highbd_idct32_c(const tran_low_t *input,
+                         tran_low_t *output, int bd) {
   tran_low_t step1[32], step2[32];
   tran_high_t temp1, temp2;
   (void) bd;
@@ -2447,7 +2447,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
 
     if (zero_coeff[0] | zero_coeff[1])
-      highbd_idct32_c(input, outptr, bd);
+      vpx_highbd_idct32_c(input, outptr, bd);
     else
       memset(outptr, 0, sizeof(tran_low_t) * 32);
     input += 32;
@@ -2458,7 +2458,7 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    highbd_idct32_c(temp_in, temp_out, bd);
+    vpx_highbd_idct32_c(temp_in, temp_out, bd);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2477,7 +2477,7 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
 
   // Rows
   // Only upper-left 8x8 has non-zero coeff.
   for (i = 0; i < 8; ++i) {
-    highbd_idct32_c(input, outptr, bd);
+    vpx_highbd_idct32_c(input, outptr, bd);
     input += 32;
     outptr += 32;
   }
@@ -2485,7 +2485,7 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    highbd_idct32_c(temp_in, temp_out, bd);
+    vpx_highbd_idct32_c(temp_in, temp_out, bd);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index 23588139e..adbb83872 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -100,6 +100,7 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output);
 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
 
 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
diff --git a/vpx_dsp/txfm_common.h b/vpx_dsp/txfm_common.h
index 442e6a57b..9b0e9900a 100644
--- a/vpx_dsp/txfm_common.h
+++ b/vpx_dsp/txfm_common.h
@@ -57,10 +57,13 @@ static const tran_high_t cospi_29_64 = 2404;
 static const tran_high_t cospi_30_64 = 1606;
 static const tran_high_t cospi_31_64 = 804;
 
-// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 
+// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
 static const tran_high_t sinpi_1_9 = 5283;
 static const tran_high_t sinpi_2_9 = 9929;
 static const tran_high_t sinpi_3_9 = 13377;
 static const tran_high_t sinpi_4_9 = 15212;
 
+// 16384 * sqrt(2)
+static const tran_high_t Sqrt2 = 23170;
+
 #endif  // VPX_DSP_TXFM_COMMON_H_
-- 
2.40.0
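
Background sketch (illustrative only, not part of the patch): the building block the patch
adds is the "half-right" / "half-center" 32-point transform, where 16 of the 32 inputs pass
through a scaled identity (multiply by 4) and the other 16 go through a length-16 DCT
pre-scaled by Sqrt2, so that, per the in-code comments, both halves come out at roughly
four times an orthonormal transform. The standalone C program below mirrors the structure
of fhalfright32() in floating point. naive_dct16() and halfright32_sketch() are hypothetical
names, not libvpx APIs; the explicit 4x scaling on the DCT path here stands in for the
Sqrt2 pre-scale combined with the internal gain of libvpx's fixed-point fdct16().

/* Build with, e.g.: cc -o halfright_sketch halfright_sketch.c -lm */
#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Orthonormal DCT-II of length 16 (reference implementation, O(N^2)). */
static void naive_dct16(const double *in, double *out) {
  int k, n;
  for (k = 0; k < 16; ++k) {
    double sum = 0.0;
    for (n = 0; n < 16; ++n)
      sum += in[n] * cos(M_PI * (2 * n + 1) * k / 32.0);
    out[k] = sum * (k == 0 ? sqrt(1.0 / 16.0) : sqrt(2.0 / 16.0));
  }
}

/* Half-right 32-point forward transform, floating-point sketch:
 * outputs 16..31 are the first 16 inputs passed through an identity,
 * outputs 0..15 are a DCT-16 of the last 16 inputs.  Both halves are
 * scaled by 4 so the overall gain matches on each path. */
static void halfright32_sketch(const double *input, double *output) {
  double dct_out[16];
  int i;
  for (i = 0; i < 16; ++i)
    output[16 + i] = input[i] * 4.0;        /* identity half */
  naive_dct16(input + 16, dct_out);         /* DCT half */
  for (i = 0; i < 16; ++i)
    output[i] = dct_out[i] * 4.0;
}

int main(void) {
  double in[32], out[32];
  int i;
  for (i = 0; i < 32; ++i)
    in[i] = (double)(i % 7);                /* arbitrary test signal */
  halfright32_sketch(in, out);
  for (i = 0; i < 32; ++i)
    printf("%2d: %9.3f\n", i, out[i]);
  return 0;
}

Reusing the existing 16-point DCT plus a scaled copy of the input keeps the new 32x32
row/column transforms cheap relative to a dedicated 32-point ADST or DST, which appears
to be the motivation for pairing these with the full idct32/fdct32 in the FHT_32 and
IHT_32 tables above.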