From 9b88762b178f9f335d434090a398f2e5a6f2182d Mon Sep 17 00:00:00 2001
From: Debargha Mukherjee <debargha@google.com>
Date: Mon, 14 Mar 2016 22:30:09 -0700
Subject: [PATCH] Refactor 1D transforms

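Add 1D identity transforms (iidtx*/fidtx*) and route V_DCT and H_DCT
through the generic 2D hybrid-transform tables instead of the dedicated
idtx helpers. This is in preparation for adding more 1D variants with
ADST/FlipADST/etc.

BD-rate actually improves by 0.21% on lowres.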

Change-Id: I2fa4720c69fe001fa666119a284dfc6b17fffab2
---
 vp10/common/idct.c             | 298 ++++++++++++++-------------------
 vp10/encoder/dct.c             | 121 +++++--------
 vp10/encoder/hybrid_fwd_txfm.c |  25 ++-
 3 files changed, 183 insertions(+), 261 deletions(-)

diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index f621ec61b..863f0db6b 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -260,6 +260,30 @@ void idst16_c(const tran_low_t *input, tran_low_t *output) {
 }
 
 #if CONFIG_EXT_TX
+static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
+  int k;  // Element-wise: output[k] depends only on input[k] (4 entries).
+  for (k = 0; k < 4; ++k)
+    output[k] = (tran_low_t)dct_const_round_shift(Sqrt2 * input[k]);
+}
+
+static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
+  int k;  // Element-wise: each of the 8 outputs is twice its input.
+  for (k = 0; k < 8; ++k)
+    output[k] = 2 * input[k];
+}
+
+static void iidtx16_c(const tran_low_t *input, tran_low_t *output) {
+  int i;  // 2 * Sqrt2 folded first: no int-width input[i] * 2 intermediate.
+  for (i = 0; i < 16; ++i)
+    output[i] = (tran_low_t)dct_const_round_shift(input[i] * (2 * Sqrt2));
+}
+
+static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
+  int k;  // Element-wise: each of the 32 outputs is four times its input.
+  for (k = 0; k < 32; ++k)
+    output[k] = 4 * input[k];
+}
+
 // For use in lieu of DST
 static void ihalfcenter32_c(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -291,6 +315,37 @@ static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_iidtx4_c(const tran_low_t *input, tran_low_t *output,
+                            int bd) {
+  int k;  // Element-wise: output[k] depends only on input[k] (4 entries).
+  for (k = 0; k < 4; ++k)
+    output[k] = (tran_low_t)highbd_dct_const_round_shift(Sqrt2 * input[k], bd);
+}
+
+static void highbd_iidtx8_c(const tran_low_t *input, tran_low_t *output,
+                            int bd) {
+  int k;
+  (void)bd;  // Unused: the doubling below does not read the bit depth.
+  for (k = 0; k < 8; ++k)
+    output[k] = 2 * input[k];
+}
+
+static void highbd_iidtx16_c(const tran_low_t *input, tran_low_t *output,
+                             int bd) {
+  int i;  // 2 * Sqrt2 folded first: no int-width input[i] * 2 intermediate.
+  for (i = 0; i < 16; ++i)
+    output[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * (2 * Sqrt2), bd);
+}
+
+static void highbd_iidtx32_c(const tran_low_t *input, tran_low_t *output,
+                             int bd) {
+  int k;
+  (void)bd;  // Unused: the scaling below does not read the bit depth.
+  for (k = 0; k < 32; ++k)
+    output[k] = 4 * input[k];
+}
+
 static void highbd_ihalfcenter32_c(const tran_low_t *input, tran_low_t *output,
                                    int bd) {
   int i;
@@ -331,85 +386,19 @@ static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int bs, int tx_type) {
   int r, c;
   const int shift = bs < 32 ? 3 : 2;
-
-  tran_low_t temp_in[32], temp_out[32];
-  transform_2d ht = {idct4_c, idct4_c};
-  int out_scale = 1;
-  int coeff_stride = 0;
-
-  switch (bs) {
-    case 4:
-      ht.cols = idct4_c;
-      ht.rows = idct4_c;
-      out_scale = cospi_16_64 >> 3;
-      coeff_stride = 4;
-      break;
-    case 8:
-      ht.cols = idct8_c;
-      ht.rows = idct8_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 8;
-      break;
-    case 16:
-      ht.cols = idct16_c;
-      ht.rows = idct16_c;
-      out_scale = cospi_16_64 >> 4;
-      coeff_stride = 16;
-      break;
-    case 32:
-      ht.cols = idct32_c;
-      ht.rows = idct32_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 32;
-      break;
-    default:
-      assert(0);
-  }
-
-  // Columns
-  if (tx_type == V_DCT) {
-    for (c = 0; c < bs; ++c) {
-      for (r = 0; r < bs; ++r)
-        temp_in[r] = input[r * coeff_stride + c];
-      ht.cols(temp_in, temp_out);
-
-      for (r = 0; r < bs; ++r) {
-        tran_high_t temp = (tran_high_t)temp_out[r] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
-                                              (tran_low_t)temp);
-      }
-    }
-    return;
-  }
-
-  if (tx_type == H_DCT) {
+  if (tx_type == IDTX) {
     for (r = 0; r < bs; ++r) {
       for (c = 0; c < bs; ++c)
-        temp_in[c] = input[r * coeff_stride + c];
-      ht.rows(temp_in, temp_out);
-
-      for (c = 0; c < bs; ++c) {
-        tran_high_t temp = (tran_high_t)temp_out[c] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
-                                              (tran_low_t)temp);
-      }
+        dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
+      dest += stride;
+      input += bs;
     }
-    return;
-  }
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c)
-      dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
-    dest += stride;
-    input += bs;
   }
 }
 
 #define FLIPUD_PTR(dest, stride, size) do {     \
-    (dest) = (dest) + ((size) - 1) * (stride);  \
-    (stride) = - (stride);                      \
+  (dest) = (dest) + ((size) - 1) * (stride);  \
+  (stride) = - (stride);                      \
 } while (0)
 
 static void maybe_flip_strides(uint8_t **dst, int *dstride,
@@ -428,6 +417,7 @@ static void maybe_flip_strides(uint8_t **dst, int *dstride,
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case IDTX:
     case V_DCT:
     case H_DCT:
       break;
@@ -705,78 +695,13 @@ static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
   const int shift = bs < 32 ? 3 : 2;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  tran_low_t temp_in[32], temp_out[32];
-  highbd_transform_2d ht = {vpx_highbd_idct4_c, vpx_highbd_idct4_c};
-  int out_scale = 1;
-  int coeff_stride = 0;
-
-  switch (bs) {
-    case 4:
-      ht.cols = vpx_highbd_idct4_c;
-      ht.rows = vpx_highbd_idct4_c;
-      out_scale = cospi_16_64 >> 3;
-      coeff_stride = 4;
-      break;
-    case 8:
-      ht.cols = vpx_highbd_idct8_c;
-      ht.rows = vpx_highbd_idct8_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 8;
-      break;
-    case 16:
-      ht.cols = vpx_highbd_idct16_c;
-      ht.rows = vpx_highbd_idct16_c;
-      out_scale = cospi_16_64 >> 4;
-      coeff_stride = 16;
-      break;
-    case 32:
-      ht.cols = vpx_highbd_idct32_c;
-      ht.rows = vpx_highbd_idct32_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 32;
-      break;
-    default:
-      assert(0);
-  }
-
-  // Columns
-  if (tx_type == V_DCT) {
-    for (c = 0; c < bs; ++c) {
-      for (r = 0; r < bs; ++r)
-        temp_in[r] = input[r * coeff_stride + c];
-      ht.cols(temp_in, temp_out, bd);
-
-      for (r = 0; r < bs; ++r) {
-        tran_high_t temp = (tran_high_t)temp_out[r] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = highbd_clip_pixel_add(dest[r * stride + c],
-                                                     (tran_low_t)temp, bd);
-      }
-    }
-    return;
-  }
-
-  if (tx_type == H_DCT) {
+  if (tx_type == IDTX) {
     for (r = 0; r < bs; ++r) {
       for (c = 0; c < bs; ++c)
-        temp_in[c] = input[r * coeff_stride + c];
-      ht.rows(temp_in, temp_out, bd);
-
-      for (c = 0; c < bs; ++c) {
-        tran_high_t temp = (tran_high_t)temp_out[c] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = highbd_clip_pixel_add(dest[r * stride + c],
-                                                     (tran_low_t)temp, bd);
-      }
+        dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
+      dest += stride;
+      input += bs;
     }
-    return;
-  }
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c)
-      dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
-    dest += stride;
-    input += bs;
   }
 }
 
@@ -796,6 +721,9 @@ static void maybe_flip_strides16(uint16_t **dst, int *dstride,
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case IDTX:
+    case V_DCT:
+    case H_DCT:
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
@@ -843,6 +771,9 @@ void vp10_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
     { idst4_c,  iadst4_c },  // DST_FLIPADST      = 13,
     { iadst4_c, idst4_c  },  // FLIPADST_DST      = 14,
     { idst4_c,  idst4_c  },  // DST_DST           = 15
+    { iidtx4_c, iidtx4_c },  // IDTX              = 16
+    { idct4_c,  iidtx4_c },  // V_DCT             = 17
+    { iidtx4_c, idct4_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -906,6 +837,9 @@ void vp10_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
     { idst8_c,  iadst8_c },  // DST_FLIPADST      = 13,
     { iadst8_c, idst8_c  },  // FLIPADST_DST      = 14,
     { idst8_c,  idst8_c  },  // DST_DST           = 15
+    { iidtx8_c, iidtx8_c },  // IDTX              = 16
+    { idct8_c,  iidtx8_c },  // V_DCT             = 17
+    { iidtx8_c, idct8_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -969,6 +903,9 @@ void vp10_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
     { idst16_c,  iadst16_c },  // DST_FLIPADST      = 13,
     { iadst16_c, idst16_c  },  // FLIPADST_DST      = 14,
     { idst16_c,  idst16_c  },  // DST_DST           = 15
+    { iidtx16_c, iidtx16_c },  // IDTX              = 16
+    { idct16_c,  iidtx16_c },  // V_DCT             = 17
+    { iidtx16_c, idct16_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1032,6 +969,9 @@ void vp10_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
     { ihalfcenter32_c,  ihalfright32_c },    // DST_FLIPADST      = 13,
     { ihalfright32_c, ihalfcenter32_c  },    // FLIPADST_DST      = 14,
     { ihalfcenter32_c,  ihalfcenter32_c  },  // DST_DST           = 15
+    { iidtx32_c, iidtx32_c },                // IDTX              = 16
+    { idct32_c,  iidtx32_c },                // V_DCT             = 17
+    { iidtx32_c, idct32_c  },                // H_DCT             = 18
   };
 
   int i, j;
@@ -1165,11 +1105,11 @@ void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_iht4x4_16_add_c(input, dest, stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 4, tx_type);
       break;
@@ -1206,11 +1146,11 @@ void vp10_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_iht8x8_64_add_c(input, dest, stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 8, tx_type);
       break;
@@ -1247,11 +1187,11 @@ void vp10_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_iht16x16_256_add_c(input, dest, stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 16, tx_type);
       break;
@@ -1284,10 +1224,10 @@ void vp10_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
-      vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
+      break;
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 32, tx_type);
       break;
@@ -1319,6 +1259,9 @@ void vp10_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
     {     highbd_idst4_c,  vpx_highbd_iadst4_c },  // DST_FLIPADST      = 13,
     { vpx_highbd_iadst4_c,     highbd_idst4_c  },  // FLIPADST_DST      = 14,
     {     highbd_idst4_c,      highbd_idst4_c  },  // DST_DST           = 15
+    {     highbd_iidtx4_c,     highbd_iidtx4_c },  // IDTX              = 16
+    { vpx_highbd_idct4_c,      highbd_iidtx4_c },  // V_DCT             = 17
+    {     highbd_iidtx4_c, vpx_highbd_idct4_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1385,6 +1328,9 @@ void vp10_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
     {     highbd_idst8_c,  vpx_highbd_iadst8_c },  // DST_FLIPADST      = 13,
     { vpx_highbd_iadst8_c,     highbd_idst8_c  },  // FLIPADST_DST      = 14,
     {     highbd_idst8_c,      highbd_idst8_c  },  // DST_DST           = 15
+    {     highbd_iidtx8_c,     highbd_iidtx8_c },  // IDTX              = 16
+    { vpx_highbd_idct8_c,      highbd_iidtx8_c },  // V_DCT             = 17
+    {     highbd_iidtx8_c, vpx_highbd_idct8_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1451,6 +1397,9 @@ void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
     {     highbd_idst16_c,  vpx_highbd_iadst16_c },  // DST_FLIPADST      = 13,
     { vpx_highbd_iadst16_c,     highbd_idst16_c  },  // FLIPADST_DST      = 14,
     {     highbd_idst16_c,      highbd_idst16_c  },  // DST_DST           = 15
+    {     highbd_iidtx16_c,     highbd_iidtx16_c },  // IDTX              = 16
+    { vpx_highbd_idct16_c,      highbd_iidtx16_c },  // V_DCT             = 17
+    {     highbd_iidtx16_c, vpx_highbd_idct16_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1501,22 +1450,25 @@ void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
 void vp10_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int tx_type, int bd) {
   static const highbd_transform_2d HIGH_IHT_32[] = {
-    { vpx_highbd_idct32_c, vpx_highbd_idct32_c  },        // DCT_DCT
-    { highbd_ihalfright32_c, vpx_highbd_idct32_c  },      // ADST_DCT
-    { vpx_highbd_idct32_c, highbd_ihalfright32_c },       // DCT_ADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // ADST_ADST
-    { highbd_ihalfright32_c, vpx_highbd_idct32_c  },      // FLIPADST_DCT
-    { vpx_highbd_idct32_c, highbd_ihalfright32_c },       // DCT_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // FLIPADST_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // ADST_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // FLIPADST_ADST
-    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c  },     // DST_DCT
-    { vpx_highbd_idct32_c, highbd_ihalfcenter32_c  },     // DCT_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },    // DST_ADST
-    { highbd_ihalfright32_c, highbd_ihalfcenter32_c  },   // ADST_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },    // DST_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfcenter32_c  },   // FLIPADST_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c  },  // DST_DST
+    { vpx_highbd_idct32_c,    vpx_highbd_idct32_c    },  // DCT_DCT
+    { highbd_ihalfright32_c,  vpx_highbd_idct32_c    },  // ADST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfright32_c  },  // DCT_ADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_ADST
+    { highbd_ihalfright32_c,  vpx_highbd_idct32_c    },  // FLIPADST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfright32_c  },  // DCT_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_ADST
+    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c    },  // DST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfcenter32_c },  // DCT_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c  },  // DST_ADST
+    { highbd_ihalfright32_c,  highbd_ihalfcenter32_c },  // ADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c  },  // DST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfcenter32_c },  // FLIPADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c },  // DST_DST
+    {     highbd_iidtx32_c,   highbd_iidtx32_c       },  // IDTX
+    { vpx_highbd_idct32_c,    highbd_iidtx32_c       },  // V_DCT
+    {     highbd_iidtx32_c,   vpx_highbd_idct32_c    },  // H_DCT
   };
 
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1657,11 +1609,11 @@ void vp10_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 4, tx_type, bd);
       break;
@@ -1699,11 +1651,11 @@ void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 8, tx_type, bd);
       break;
@@ -1741,11 +1693,11 @@ void vp10_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 16, tx_type, bd);
       break;
@@ -1779,10 +1731,10 @@ void vp10_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
-      vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
+      break;
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 32, tx_type, bd);
       break;
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 31a4c87c2..8a1ee201c 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -1212,6 +1212,30 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
 }
 
 #if CONFIG_EXT_TX
+static void fidtx4(const tran_low_t *input, tran_low_t *output) {
+  int k;  // Element-wise: output[k] depends only on input[k] (4 entries).
+  for (k = 0; k < 4; ++k)
+    output[k] = (tran_low_t)fdct_round_shift(Sqrt2 * input[k]);
+}
+
+static void fidtx8(const tran_low_t *input, tran_low_t *output) {
+  int k;  // Element-wise: each of the 8 outputs is twice its input.
+  for (k = 0; k < 8; ++k)
+    output[k] = 2 * input[k];
+}
+
+static void fidtx16(const tran_low_t *input, tran_low_t *output) {
+  int i;  // 2 * Sqrt2 folded first: no int-width input[i] * 2 intermediate.
+  for (i = 0; i < 16; ++i)
+    output[i] = (tran_low_t)fdct_round_shift(input[i] * (2 * Sqrt2));
+}
+
+static void fidtx32(const tran_low_t *input, tran_low_t *output) {
+  int k;  // Element-wise: each of the 32 outputs is four times its input.
+  for (k = 0; k < 32; ++k)
+    output[k] = 4 * input[k];
+}
+
 // For use in lieu of DST
 static void fhalfcenter32(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -1315,6 +1339,7 @@ static void maybe_flip_input(const int16_t **src, int *src_stride, int l,
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case IDTX:
     case H_DCT:
     case V_DCT:
       break;
@@ -1362,6 +1387,9 @@ static const transform_2d FHT_4[] = {
   { fdst4,  fadst4 },  // DST_FLIPADST      = 13,
   { fadst4, fdst4  },  // FLIPADST_DST      = 14,
   { fdst4,  fdst4  },  // DST_DST           = 15
+  { fidtx4, fidtx4 },  // IDTX              = 16
+  { fdct4,  fidtx4 },  // V_DCT             = 17
+  { fidtx4, fdct4  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
 };
 
@@ -1383,6 +1411,9 @@ static const transform_2d FHT_8[] = {
   { fdst8,  fadst8 },  // DST_FLIPADST      = 13,
   { fadst8, fdst8  },  // FLIPADST_DST      = 14,
   { fdst8,  fdst8  },  // DST_DST           = 15
+  { fidtx8, fidtx8 },  // IDTX              = 16
+  { fdct8,  fidtx8 },  // V_DCT             = 17
+  { fidtx8, fdct8  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
 };
 
@@ -1404,6 +1435,9 @@ static const transform_2d FHT_16[] = {
   { fdst16,  fadst16 },  // DST_FLIPADST      = 13,
   { fadst16, fdst16  },  // FLIPADST_DST      = 14,
   { fdst16,  fdst16  },  // DST_DST           = 15
+  { fidtx16, fidtx16 },  // IDTX              = 16
+  { fdct16,  fidtx16 },  // V_DCT             = 17
+  { fidtx16, fdct16  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
 };
 
@@ -1425,6 +1459,9 @@ static const transform_2d FHT_32[] = {
   { fhalfcenter32,  fhalfright32 },    // DST_FLIPADST      = 13,
   { fhalfright32, fhalfcenter32  },    // FLIPADST_DST      = 14,
   { fhalfcenter32,  fhalfcenter32  },  // DST_DST           = 15
+  { fidtx32, fidtx32 },                // IDTX              = 16
+  { fdct32,  fidtx32 },                // V_DCT             = 17
+  { fidtx32, fdct32  },                // H_DCT             = 18
 };
 #endif  // CONFIG_EXT_TX
 
@@ -1766,86 +1803,12 @@ void vp10_fwd_idtx_c(const int16_t *src_diff,
                      int bs, int tx_type) {
   int r, c;
   const int shift = bs < 32 ? 3 : 2;
-
-  const int16_t *input = src_diff;
-  tran_low_t *output = coeff;
-
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-  transform_2d ht = {fdct4, fdct4};
-  int in_scale = 1;
-  int out_scale = 1;
-  int coeff_stride = 0;
-
-  switch (bs) {
-    case 4:
-      ht.cols = fdct4;
-      ht.rows = fdct4;
-      in_scale = 16;
-      out_scale = cospi_16_64 >> 1;
-      coeff_stride = 4;
-      break;
-    case 8:
-      ht.cols = fdct8;
-      ht.rows = fdct8;
-      in_scale = 4;
-      out_scale = (1 << DCT_CONST_BITS);
-      coeff_stride = 8;
-      break;
-    case 16:
-      ht.cols = fdct16;
-      ht.rows = fdct16;
-      in_scale = 4;
-      out_scale = cospi_16_64;
-      coeff_stride = 16;
-      break;
-    case 32:
-      ht.cols = fdct32;
-      ht.rows = fdct32;
-      in_scale = 4;
-      out_scale = (1 << (DCT_CONST_BITS - 2));
-      coeff_stride = 32;
-      break;
-    default:
-      assert(0);
-  }
-
-  // Columns
-  if (tx_type == V_DCT) {
-    for (i = 0; i < bs; ++i) {
-      for (j = 0; j < bs; ++j)
-        temp_in[j] = input[j * stride + i] * in_scale;
-      ht.cols(temp_in, temp_out);
-
-      for (j = 0; j < bs; ++j) {
-        tran_high_t temp = (tran_high_t)temp_out[j] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        output[j * coeff_stride + i] = (tran_low_t)temp;
-      }
+  if (tx_type == IDTX) {
+    for (r = 0; r < bs; ++r) {
+      for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] << shift;
+      src_diff += stride;
+      coeff += bs;
     }
-    return;
-  }
-
-  // Rows
-  if (tx_type == H_DCT) {
-    for (j = 0; j < bs; ++j) {
-      for (i = 0; i < bs; ++i)
-        temp_in[i] = input[j * stride + i] * in_scale;
-      ht.rows(temp_in, temp_out);
-
-      for (i = 0; i < bs; ++i) {
-        tran_high_t temp = (tran_high_t)temp_out[i] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        output[j * coeff_stride + i] = (tran_low_t)temp;
-      }
-    }
-    return;
-  }
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] << shift;
-    src_diff += stride;
-    coeff += bs;
   }
 }
 
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index c3a739b7d..faedb4349 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -65,6 +65,8 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
       break;
@@ -105,6 +107,8 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
       break;
@@ -145,6 +149,8 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
       break;
@@ -185,6 +191,8 @@ static void fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
       break;
@@ -226,11 +234,10 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
-      // Use C version since DST exists only in C
-      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
       break;
@@ -270,11 +277,11 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST exists only in C
       vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
       break;
@@ -314,11 +321,11 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST exists only in C
       vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
       break;
@@ -355,10 +362,10 @@ static void highbd_fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
-      vp10_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
       break;
-- 
2.40.0