From 6a47cff882d027542d3320a57e14cbe5ba7404ed Mon Sep 17 00:00:00 2001
From: Debargha Mukherjee <debargha@google.com>
Date: Wed, 2 Nov 2016 14:57:42 -0700
Subject: [PATCH] Further work on 64x64 fwd/inv transform support

For higher level fwd and inv transform functions.

Change-Id: I91518250a0be7d94aada7519f6c9e7ed024574fb
---
 av1/common/av1_rtcd_defs.pl   |  17 +++++
 av1/common/enums.h            |  36 ++++++----
 av1/common/idct.c             | 129 +++++++++++++++++++++++++++++-----
 av1/encoder/hybrid_fwd_txfm.c |  98 ++++++++++++++++++++++++++
 4 files changed, 249 insertions(+), 31 deletions(-)

diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 597d5b2a0..ee46820fe 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -111,6 +111,9 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
     specialize qw/av1_iht16x16_256_add sse2 avx2/;
+
+    add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/av1_iht32x32_1024_add/;
   }
 } else {
   # Force C versions if CONFIG_EMULATE_HARDWARE is 1
@@ -141,6 +144,10 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
 
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
     specialize qw/av1_iht16x16_256_add/;
+
+    add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/av1_iht32x32_1024_add/;
+
   } else {
     add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht4x4_16_add sse2 neon dspr2/;
@@ -169,6 +176,9 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
     specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;
 
+    add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/av1_iht32x32_1024_add/;
+
     if (aom_config("CONFIG_EXT_TX") ne "yes") {
       specialize qw/av1_iht4x4_16_add msa/;
       specialize qw/av1_iht8x8_64_add msa/;
@@ -176,6 +186,13 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
     }
   }
 }
+add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+specialize qw/av1_iht32x32_1024_add/;
+
+if (aom_config("CONFIG_TX64X64") eq "yes") {
+  add_proto qw/void av1_iht64x64_4096_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+  specialize qw/av1_iht64x64_4096_add/;
+}
 
 if (aom_config("CONFIG_NEW_QUANT") eq "yes") {
   add_proto qw/void quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 2ec83ecae..6c0eb3d6b 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -134,24 +134,32 @@ typedef enum ATTRIBUTE_PACKED {
 #if CONFIG_CB4X4
   TX_2X2,  // 2x2 transform
 #endif
-  TX_4X4,                   // 4x4 transform
-  TX_8X8,                   // 8x8 transform
-  TX_16X16,                 // 16x16 transform
-  TX_32X32,                 // 32x32 transform
-  TX_4X8,                   // 4x8 transform
-  TX_8X4,                   // 8x4 transform
-  TX_8X16,                  // 8x16 transform
-  TX_16X8,                  // 16x8 transform
-  TX_16X32,                 // 16x32 transform
-  TX_32X16,                 // 32x16 transform
-  TX_SIZES_ALL,             // Includes rectangular transforms
-  TX_SIZES = TX_32X32 + 1,  // Does NOT include rectangular transforms
-  TX_INVALID = 255          // Invalid transform size
+  TX_4X4,    // 4x4 transform
+  TX_8X8,    // 8x8 transform
+  TX_16X16,  // 16x16 transform
+  TX_32X32,  // 32x32 transform
+#if CONFIG_TX64X64
+  TX_64X64,           // 64x64 transform
+#endif                // CONFIG_TX64X64
+  TX_4X8,             // 4x8 transform
+  TX_8X4,             // 8x4 transform
+  TX_8X16,            // 8x16 transform
+  TX_16X8,            // 16x8 transform
+  TX_16X32,           // 16x32 transform
+  TX_32X16,           // 32x16 transform
+#if 0                 // CONFIG_TX64X64
+  // TODO(debargha): To be enabled later
+  TX_32X64,                 // 32x64 transform
+  TX_64X32,                 // 64x32 transform
+#endif                // CONFIG_TX64X64
+  TX_SIZES_ALL,       // Includes rectangular transforms
+  TX_SIZES = TX_4X8,  // Does NOT include rectangular transforms
+  TX_INVALID = 255    // Invalid transform size
 } TX_SIZE;
 
 #define MAX_TX_DEPTH (TX_32X32 - TX_4X4)
 
-#define MAX_TX_SIZE_LOG2 5
+#define MAX_TX_SIZE_LOG2 (5 + CONFIG_TX64X64)
 #define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
 #define MIN_TX_SIZE_LOG2 2
 #define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2)
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 223c57744..2663d2d36 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -23,14 +23,14 @@
 int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
                  const TX_SIZE tx_size) {
   (void)tx_type;
-#if CONFIG_AOM_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    return txsize_sqr_up_map[tx_size] == TX_32X32;
-  }
-#else
   (void)xd;
-#endif
-  return txsize_sqr_up_map[tx_size] == TX_32X32;
+  if (txsize_sqr_up_map[tx_size] == TX_32X32) return 1;
+#if CONFIG_TX64X64
+  else if (txsize_sqr_up_map[tx_size] == TX_64X64)
+    return 2;
+#endif  // CONFIG_TX64X64
+  else
+    return 0;
 }
 
 // NOTE: The implementation of all inverses need to be aware of the fact
@@ -58,6 +58,14 @@ static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
   int i;
   for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
 }
+
+#if CONFIG_TX64X64
+static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 64; ++i)
+    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
 
 // For use in lieu of ADST
@@ -94,12 +102,6 @@ static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
   for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
 }
 
-static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  for (i = 0; i < 64; ++i)
-    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
-}
-
 // For use in lieu of ADST
 static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -174,7 +176,10 @@ static void highbd_iidtx64_c(const tran_low_t *input, tran_low_t *output,
     output[i] =
         HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * 4 * Sqrt2), bd);
 }
+#endif  // CONFIG_TX64X64
+#endif  // CONFIG_EXT_TX
 
+#if CONFIG_TX64X64
 // For use in lieu of ADST
 static void highbd_ihalfright64_c(const tran_low_t *input, tran_low_t *output,
                                   int bd) {
@@ -215,7 +220,6 @@ static void highbd_idct64_row_c(const tran_low_t *input, tran_low_t *output,
   for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 // Inverse identity transform and add.
@@ -223,7 +227,7 @@ static void highbd_idct64_row_c(const tran_low_t *input, tran_low_t *output,
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int bs, int tx_type) {
   int r, c;
-  const int shift = bs < 32 ? 3 : 2;
+  const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
   if (tx_type == IDTX) {
     for (r = 0; r < bs; ++r) {
       for (c = 0; c < bs; ++c)
@@ -929,6 +933,7 @@ void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
     }
   }
 }
+#endif  // CONFIG_EXT_TX
 
 #if CONFIG_TX64X64
 void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -938,6 +943,7 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
     { ihalfright64_c, idct64_row_c },    // ADST_DCT
     { idct64_col_c, ihalfright64_c },    // DCT_ADST
     { ihalfright64_c, ihalfright64_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { ihalfright64_c, idct64_row_c },    // FLIPADST_DCT
     { idct64_col_c, ihalfright64_c },    // DCT_FLIPADST
     { ihalfright64_c, ihalfright64_c },  // FLIPADST_FLIPADST
@@ -950,6 +956,7 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
     { iidtx64_c, ihalfright64_c },       // H_ADST
     { ihalfright64_c, iidtx64_c },       // V_FLIPADST
     { iidtx64_c, ihalfright64_c },       // H_FLIPADST
+#endif                                   // CONFIG_EXT_TX
   };
 
   int i, j;
@@ -979,7 +986,9 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
     IHT_64[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+#endif  // CONFIG_EXT_TX
 
   // Sum with the destination
   for (i = 0; i < 64; ++i) {
@@ -991,7 +1000,6 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
   }
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
 
 // idct
 void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -1056,6 +1064,14 @@ void av1_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
     aom_idct32x32_1024_add(input, dest, stride);
 }
 
+#if CONFIG_TX64X64
+void av1_idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob) {
+  (void)eob;
+  av1_iht64x64_4096_add(input, dest, stride, DCT_DCT);
+}
+#endif  // CONFIG_TX64X64
+
 void av1_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type, int lossless) {
   if (lossless) {
@@ -1206,6 +1222,35 @@ void av1_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest, int stride,
   }
 }
 
+#if CONFIG_TX64X64
+void av1_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, TX_TYPE tx_type) {
+  switch (tx_type) {
+    case DCT_DCT: av1_idct64x64_add(input, dest, stride, eob); break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      av1_iht64x64_4096_add_c(input, dest, stride, tx_type);
+      break;
+    case IDTX: inv_idtx_add_c(input, dest, stride, 64, tx_type); break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 void av1_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
@@ -1835,6 +1880,7 @@ void av1_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
     }
   }
 }
+#endif  // CONFIG_EXT_TX
 
 #if CONFIG_TX64X64
 void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1844,6 +1890,7 @@ void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
     { highbd_ihalfright64_c, highbd_idct64_row_c },    // ADST_DCT
     { highbd_idct64_col_c, highbd_ihalfright64_c },    // DCT_ADST
     { highbd_ihalfright64_c, highbd_ihalfright64_c },  // ADST_ADST
+#if CONFIG_EXT_TX
     { highbd_ihalfright64_c, highbd_idct64_row_c },    // FLIPADST_DCT
     { highbd_idct64_col_c, highbd_ihalfright64_c },    // DCT_FLIPADST
     { highbd_ihalfright64_c, highbd_ihalfright64_c },  // FLIPADST_FLIPADST
@@ -1856,6 +1903,7 @@ void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
     { highbd_iidtx64_c, highbd_ihalfright64_c },       // H_ADST
     { highbd_ihalfright64_c, highbd_iidtx64_c },       // V_FLIPADST
     { highbd_iidtx64_c, highbd_ihalfright64_c },       // H_FLIPADST
+#endif                                                 // CONFIG_EXT_TX
   };
 
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1887,7 +1935,9 @@ void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
     HIGH_IHT_64[tx_type].cols(out[i], out[i], bd);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+#endif  // CONFIG_EXT_TX
 
   // Sum with the destination
   for (i = 0; i < 64; ++i) {
@@ -1900,7 +1950,6 @@ void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_EXT_TX
 
 // idct
 void av1_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -2155,6 +2204,42 @@ void av1_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
     default: assert(0); break;
   }
 }
+
+#if CONFIG_TX64X64
+void av1_highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
+                                   int stride, int eob, int bd,
+                                   TX_TYPE tx_type) {
+  (void)eob;
+  switch (tx_type) {
+    case DCT_DCT:
+      av1_inv_txfm2d_add_64x64(input, CONVERT_TO_SHORTPTR(dest), stride,
+                               DCT_DCT, bd);
+      break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      av1_highbd_iht64x64_4096_add_c(input, dest, stride, tx_type, bd);
+      break;
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 64, tx_type, bd);
+      break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -2165,6 +2250,11 @@ void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
   const int lossless = inv_txfm_param->lossless;
 
   switch (tx_size) {
+#if CONFIG_TX64X64
+    case TX_64X64:
+      av1_inv_txfm_add_64x64(input, dest, stride, eob, tx_type);
+      break;
+#endif  // CONFIG_TX64X64
     case TX_32X32:
       av1_inv_txfm_add_32x32(input, dest, stride, eob, tx_type);
       break;
@@ -2206,6 +2296,11 @@ void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
   const int lossless = inv_txfm_param->lossless;
 
   switch (tx_size) {
+#if CONFIG_TX64X64
+    case TX_64X64:
+      av1_highbd_inv_txfm_add_64x64(input, dest, stride, eob, bd, tx_type);
+      break;
+#endif  // CONFIG_TX64X64
     case TX_32X32:
       av1_highbd_inv_txfm_add_32x32(input, dest, stride, eob, bd, tx_type);
       break;
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index ff0351630..a88c88435 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -24,6 +24,24 @@ static INLINE void fdct32x32(int rd_transform, const int16_t *src,
     av1_fht32x32(src, dst, src_stride, DCT_DCT);
 }
 
+#if CONFIG_TX64X64
+static INLINE void fdct64x64(const int16_t *src, tran_low_t *dst,
+                             int src_stride) {
+  av1_fht64x64(src, dst, src_stride, DCT_DCT);
+}
+
+static INLINE void fdct64x64_1(const int16_t *src, tran_low_t *dst,
+                               int src_stride) {
+  int i, j;
+  int32_t sum = 0;
+  memset(dst, 0, sizeof(*dst) * 4096);
+  for (i = 0; i < 64; ++i)
+    for (j = 0; j < 64; ++j) sum += src[i * src_stride + j];
+  // Note: this scaling makes the transform 2 times unitary
+  dst[0] = ROUND_POWER_OF_TWO_SIGNED(sum, 5);
+}
+#endif  // CONFIG_TX64X64
+
 static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TX_TYPE tx_type, int lossless) {
   if (lossless) {
@@ -192,6 +210,41 @@ static void fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
   }
 }
 
+#if CONFIG_TX64X64
+static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  switch (tx_type) {
+    case DCT_DCT:
+      if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+        fdct64x64(src_diff, coeff, diff_stride);
+      else  // FWD_TXFM_OPT_DC
+        fdct64x64_1(src_diff, coeff, diff_stride);
+      break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      av1_fht64x64(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST: av1_fht32x32(src_diff, coeff, diff_stride, tx_type); break;
+    case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
 static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TX_TYPE tx_type, int lossless,
@@ -379,6 +432,40 @@ static void highbd_fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
     default: assert(0); break;
   }
 }
+
+#if CONFIG_TX64X64
+static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TX_TYPE tx_type,
+                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  (void)fwd_txfm_opt;
+  (void)bd;
+  switch (tx_type) {
+    case DCT_DCT:
+      av1_highbd_fht64x64_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      av1_highbd_fht64x64_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
@@ -389,6 +476,11 @@ void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
   const int rd_transform = fwd_txfm_param->rd_transform;
   const int lossless = fwd_txfm_param->lossless;
   switch (tx_size) {
+#if CONFIG_TX64X64
+    case TX_64X64:
+      fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+      break;
+#endif  // CONFIG_TX64X64
     case TX_32X32:
       fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type,
                      fwd_txfm_opt);
@@ -434,6 +526,12 @@ void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
   const int lossless = fwd_txfm_param->lossless;
   const int bd = fwd_txfm_param->bd;
   switch (tx_size) {
+#if CONFIG_TX64X64
+    case TX_64X64:
+      highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+                            bd);
+      break;
+#endif  // CONFIG_TX64X64
     case TX_32X32:
       highbd_fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type,
                             fwd_txfm_opt, bd);
-- 
2.49.0