From: Geza Lore Date: Tue, 26 Jul 2016 16:12:43 +0000 (+0100) Subject: Use rectangular transforms for >= 8x8 blocks X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d21982c80fdc210c9aae01455912ec9b778eb5be;p=libvpx Use rectangular transforms for >= 8x8 blocks For rectangular blocks between 8x8 and 32x32, we can now code the transform size as one bigger than the largest square that fits in the block (eg, for 16x8, we can code a transform size of 16x16 rather than the previous maximum of 8x8). When this oversized transform is coded in the bitstream, the codec will use the full size rectangular transform for that block (eg 16x8 transform in the above example). Also fixes a scaling bug in 16x8/8x16 transforms. Change-Id: I62ce75f1b01c46fe2fbc727ce4abef695f4fcd43 --- diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h index ee5ce0dcc..98fbcf62d 100644 --- a/vp10/common/blockd.h +++ b/vp10/common/blockd.h @@ -524,8 +524,63 @@ static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs, const int set = get_ext_tx_set(tx_size, bs, is_inter); return is_inter ? 
num_ext_tx_set_inter[set] : num_ext_tx_set_intra[set]; } + +#if CONFIG_RECT_TX +static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { + static const char LUT[BLOCK_SIZES] = { + 0, // BLOCK_4X4 + 1, // BLOCK_4X8 + 1, // BLOCK_8X4 + 0, // BLOCK_8X8 + 1, // BLOCK_8X16 + 1, // BLOCK_16X8 + 0, // BLOCK_16X16 + 1, // BLOCK_16X32 + 1, // BLOCK_32X16 + 0, // BLOCK_32X32 + 0, // BLOCK_32X64 + 0, // BLOCK_64X32 + 0, // BLOCK_64X64 +#if CONFIG_EXT_PARTITION + 0, // BLOCK_64X128 + 0, // BLOCK_128X64 + 0, // BLOCK_128X128 +#endif // CONFIG_EXT_PARTITION + }; + + return LUT[bsize]; +} + +static INLINE int is_rect_tx_allowed(const MB_MODE_INFO *mbmi) { + return is_inter_block(mbmi) && is_rect_tx_allowed_bsize(mbmi->sb_type); +} + +static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; } +#endif // CONFIG_RECT_TX #endif // CONFIG_EXT_TX +static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode, + int is_inter) { + const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; + +#if CONFIG_EXT_TX && CONFIG_RECT_TX + if (!is_inter) { + return VPXMIN(max_tx_size, largest_tx_size); + } else { + const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize]; + if (txsize_sqr_up_map[max_rect_tx_size] <= largest_tx_size) { + return max_rect_tx_size; + } else { + return largest_tx_size; + } + } +#else + (void)is_inter; + return VPXMIN(max_tx_size, largest_tx_size); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX +} + #if CONFIG_EXT_INTRA #define ALLOW_FILTER_INTRA_MODES 1 #define ANGLE_STEP 3 diff --git a/vp10/common/common_data.h b/vp10/common/common_data.h index 65e99e195..666aa75d4 100644 --- a/vp10/common/common_data.h +++ b/vp10/common/common_data.h @@ -355,9 +355,9 @@ static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES] = { // 4X8, 8X4, 8X8 TX_4X8, TX_8X4, TX_8X8, // 8X16, 16X8, 16X16 - TX_8X8, TX_8X8, TX_16X16, + TX_8X16, TX_16X8, TX_16X16, // 16X32, 32X16, 32X32 - 
TX_16X16, TX_16X16, TX_32X32, + TX_16X32, TX_32X16, TX_32X32, // 32X64, 64X32, 64X64 TX_32X32, TX_32X32, TX_32X32, #if CONFIG_EXT_PARTITION @@ -366,6 +366,49 @@ static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES] = { #endif // CONFIG_EXT_PARTITION }; #endif // CONFIG_EXT_TX + +// Same as "max_txsize_lookup[bsize] - TX_8X8", invalid for bsize < 8X8 +static const int32_t intra_tx_size_cat_lookup[BLOCK_SIZES] = { + // 4X4 + INT32_MIN, + // 4X8, 8X4, 8X8 + INT32_MIN, INT32_MIN, TX_8X8 - TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, TX_16X16 - TX_8X8, + // 16X32, 32X16, 32X32 + TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, TX_32X32 - TX_8X8, + // 32X64, 64X32, 64X64 + TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, +#endif // CONFIG_EXT_PARTITION +}; + +#if CONFIG_EXT_TX && CONFIG_RECT_TX +// Same as "max_txsize_lookup[bsize] - TX_8X8", except for rectangular +// block which may use a rectangular transform, in which case it is +// "(max_txsize_lookup[bsize] + 1) - TX_8X8", invalid for bsize < 8X8 +static const int32_t inter_tx_size_cat_lookup[BLOCK_SIZES] = { + // 4X4 + INT32_MIN, + // 4X8, 8X4, 8X8 + INT32_MIN, INT32_MIN, TX_8X8 - TX_8X8, + // 8X16, 16X8, 16X16 + TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, + // 16X32, 32X16, 32X32 + TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, + // 32X64, 64X32, 64X64 + TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, +#endif // CONFIG_EXT_PARTITION +}; +#else +#define inter_tx_size_cat_lookup intra_tx_size_cat_lookup +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + /* clang-format on */ static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = { diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h index d1eea891d..addb493e7 100644 --- 
a/vp10/common/entropymode.h +++ b/vp10/common/entropymode.h @@ -171,6 +171,9 @@ typedef struct FRAME_COUNTS { #else unsigned int comp_ref[REF_CONTEXTS][COMP_REFS - 1][2]; #endif // CONFIG_EXT_REFS + // TODO(any): tx_size_totals is only used by the encoder to decide whether + // to use forward updates for the coeff probs, and as such it does not really + // belong into this structure. unsigned int tx_size_totals[TX_SIZES]; unsigned int tx_size[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; #if CONFIG_VAR_TX diff --git a/vp10/common/idct.c b/vp10/common/idct.c index c19c684cc..8ff403668 100644 --- a/vp10/common/idct.c +++ b/vp10/common/idct.c @@ -24,12 +24,12 @@ int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type, (void)tx_type; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - return tx_size == TX_32X32; + return txsize_sqr_up_map[tx_size] == TX_32X32; } #else (void)xd; #endif - return tx_size == TX_32X32; + return txsize_sqr_up_map[tx_size] == TX_32X32; } #if CONFIG_EXT_TX diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c index d8ce8fdec..983ad81c7 100644 --- a/vp10/decoder/decodemv.c +++ b/vp10/decoder/decodemv.c @@ -269,10 +269,9 @@ static void read_tx_size_vartx(VP10_COMMON *cm, MACROBLOCKD *xd, #endif static TX_SIZE read_selected_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd, - TX_SIZE max_tx_size, vp10_reader *r) { + int tx_size_cat, vp10_reader *r) { FRAME_COUNTS *counts = xd->counts; const int ctx = get_tx_size_context(xd); - const int tx_size_cat = max_tx_size - TX_8X8; int tx_size = vp10_read_tree(r, vp10_tx_size_tree[tx_size_cat], cm->fc->tx_size_probs[tx_size_cat][ctx]); if (counts) ++counts->tx_size[tx_size_cat][ctx][tx_size]; @@ -285,11 +284,13 @@ static TX_SIZE read_tx_size_intra(VP10_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return TX_4X4; if (bsize >= BLOCK_8X8) { - const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; if 
(tx_mode == TX_MODE_SELECT) { - return read_selected_tx_size(cm, xd, max_tx_size, r); + const TX_SIZE tx_size = + read_selected_tx_size(cm, xd, intra_tx_size_cat_lookup[bsize], r); + assert(tx_size <= max_txsize_lookup[bsize]); + return tx_size; } else { - return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]); + return tx_size_from_tx_mode(bsize, cm->tx_mode, 0); } } else { return TX_4X4; @@ -302,17 +303,20 @@ static TX_SIZE read_tx_size_inter(VP10_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return TX_4X4; if (bsize >= BLOCK_8X8) { - const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; if (allow_select && tx_mode == TX_MODE_SELECT) { - return read_selected_tx_size(cm, xd, max_tx_size, r); + const TX_SIZE coded_tx_size = + read_selected_tx_size(cm, xd, inter_tx_size_cat_lookup[bsize], r); +#if !CONFIG_RECT_TX + assert(coded_tx_size <= max_txsize_lookup[bsize]); +#else + if (coded_tx_size > max_txsize_lookup[bsize]) { + assert(coded_tx_size == max_txsize_lookup[bsize] + 1); + return max_txsize_rect_lookup[bsize]; + } +#endif // !CONFIG_RECT_TX + return coded_tx_size; } else { - TX_SIZE tx_size = - VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]); -#if CONFIG_EXT_TX && CONFIG_RECT_TX - if (txsize_sqr_map[max_txsize_rect_lookup[bsize]] <= tx_size) - tx_size = max_txsize_rect_lookup[bsize]; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - return tx_size; + return tx_size_from_tx_mode(bsize, cm->tx_mode, 1); } } else { #if CONFIG_EXT_TX && CONFIG_RECT_TX @@ -1689,11 +1693,12 @@ static void read_inter_frame_mode_info(VP10Decoder *const pbi, if (inter_block) { if (eset > 0) { - mbmi->tx_type = - vp10_read_tree(r, vp10_ext_tx_inter_tree[eset], - cm->fc->inter_ext_tx_prob[eset][mbmi->tx_size]); + mbmi->tx_type = vp10_read_tree( + r, vp10_ext_tx_inter_tree[eset], + cm->fc->inter_ext_tx_prob[eset][txsize_sqr_map[mbmi->tx_size]]); if (counts) - 
++counts->inter_ext_tx[eset][mbmi->tx_size][mbmi->tx_type]; + ++counts->inter_ext_tx[eset][txsize_sqr_map[mbmi->tx_size]] + [mbmi->tx_type]; } } else if (ALLOW_INTRA_EXT_TX) { if (eset > 0) { diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c index e07b4bcef..d33f84d32 100644 --- a/vp10/encoder/bitstream.c +++ b/vp10/encoder/bitstream.c @@ -384,15 +384,26 @@ static void update_txfm_partition_probs(VP10_COMMON *cm, vp10_writer *w, static void write_selected_tx_size(const VP10_COMMON *cm, const MACROBLOCKD *xd, vp10_writer *w) { - TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size; - BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; - const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; // For sub8x8 blocks the tx_size symbol does not need to be sent if (bsize >= BLOCK_8X8) { - vp10_write_token( - w, vp10_tx_size_tree[max_tx_size - TX_8X8], - cm->fc->tx_size_probs[max_tx_size - TX_8X8][get_tx_size_context(xd)], - &tx_size_encodings[max_tx_size - TX_8X8][tx_size]); + const TX_SIZE tx_size = mbmi->tx_size; + const int is_inter = is_inter_block(mbmi); + const int tx_size_ctx = get_tx_size_context(xd); + const int tx_size_cat = is_inter ? 
inter_tx_size_cat_lookup[bsize] + : intra_tx_size_cat_lookup[bsize]; + const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; + +#if CONFIG_EXT_TX && CONFIG_RECT_TX + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(mbmi))); + assert( + IMPLIES(is_rect_tx(tx_size), tx_size == max_txsize_rect_lookup[bsize])); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + + vp10_write_token(w, vp10_tx_size_tree[tx_size_cat], + cm->fc->tx_size_probs[tx_size_cat][tx_size_ctx], + &tx_size_encodings[tx_size_cat][coded_tx_size]); } } @@ -1411,10 +1422,12 @@ static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi, !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { int eset = get_ext_tx_set(mbmi->tx_size, bsize, is_inter); if (is_inter) { + assert(ext_tx_used_inter[eset][mbmi->tx_type]); if (eset > 0) - vp10_write_token(w, vp10_ext_tx_inter_tree[eset], - cm->fc->inter_ext_tx_prob[eset][mbmi->tx_size], - &ext_tx_inter_encodings[eset][mbmi->tx_type]); + vp10_write_token( + w, vp10_ext_tx_inter_tree[eset], + cm->fc->inter_ext_tx_prob[eset][txsize_sqr_map[mbmi->tx_size]], + &ext_tx_inter_encodings[eset][mbmi->tx_type]); } else if (ALLOW_INTRA_EXT_TX) { if (eset > 0) vp10_write_token( diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c index 588f429f0..fa9c7c0d6 100644 --- a/vp10/encoder/dct.c +++ b/vp10/encoder/dct.c @@ -1444,7 +1444,7 @@ void vp10_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, for (i = 0; i < n2; ++i) { for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n]; ht.rows(temp_in, temp_out); - for (j = 0; j < n; ++j) output[j + i * n] = (temp_out[j] + 1) >> 1; + for (j = 0; j < n; ++j) output[j + i * n] = (temp_out[j] + 1) >> 2; } // Note: overall scale factor of transform is 8 times unitary } @@ -1473,7 +1473,7 @@ void vp10_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, for (i = 0; i < n; ++i) { for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; ht.rows(temp_in, temp_out); - for (j = 0; j < n2; ++j) output[j + i * 
n2] = (temp_out[j] + 1) >> 1; + for (j = 0; j < n2; ++j) output[j + i * n2] = (temp_out[j] + 1) >> 2; } // Note: overall scale factor of transform is 8 times unitary } diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c index c32a7d57a..ff8084b91 100644 --- a/vp10/encoder/encodeframe.c +++ b/vp10/encoder/encodeframe.c @@ -5041,25 +5041,29 @@ static void encode_superblock(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t, if (output_enabled) { if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 && !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) { - const int ctx = get_tx_size_context(xd); - const int tx_size_cat = max_txsize_lookup[bsize] - TX_8X8; + const int is_inter = is_inter_block(mbmi); + const int tx_size_ctx = get_tx_size_context(xd); + const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] + : intra_tx_size_cat_lookup[bsize]; + const TX_SIZE coded_tx_size = txsize_sqr_up_map[mbmi->tx_size]; +#if CONFIG_EXT_TX && CONFIG_RECT_TX + assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(mbmi))); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX #if CONFIG_VAR_TX - if (is_inter_block(mbmi)) + if (is_inter) tx_partition_count_update(cm, xd, bsize, mi_row, mi_col, td->counts); #endif - ++td->counts->tx_size[tx_size_cat][ctx][txsize_sqr_up_map[mbmi->tx_size]]; + ++td->counts->tx_size[tx_size_cat][tx_size_ctx][coded_tx_size]; } else { int x, y; TX_SIZE tx_size; // The new intra coding scheme requires no change of transform size if (is_inter_block(&mi->mbmi)) { - tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode], - max_txsize_lookup[bsize]); -#if CONFIG_EXT_TX && CONFIG_RECT_TX - if (txsize_sqr_map[max_txsize_rect_lookup[bsize]] <= tx_size) - tx_size = max_txsize_rect_lookup[bsize]; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - if (xd->lossless[mbmi->segment_id]) tx_size = TX_4X4; + if (xd->lossless[mbmi->segment_id]) { + tx_size = TX_4X4; + } else { + tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1); + } #if 
CONFIG_EXT_TX && CONFIG_RECT_TX ++td->counts->tx_size_implied[max_txsize_lookup[bsize]] [txsize_sqr_up_map[mbmi->tx_size]]; @@ -5082,7 +5086,8 @@ static void encode_superblock(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int eset = get_ext_tx_set(mbmi->tx_size, bsize, is_inter_block(mbmi)); if (eset > 0) { if (is_inter_block(mbmi)) { - ++td->counts->inter_ext_tx[eset][mbmi->tx_size][mbmi->tx_type]; + ++td->counts->inter_ext_tx[eset][txsize_sqr_map[mbmi->tx_size]] + [mbmi->tx_type]; } else { ++td->counts ->intra_ext_tx[eset][mbmi->tx_size][mbmi->mode][mbmi->tx_type]; diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c index 3f28008ad..639fb3f42 100644 --- a/vp10/encoder/encoder.c +++ b/vp10/encoder/encoder.c @@ -290,8 +290,7 @@ static void setup_frame(VP10_COMP *cpi) { else if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME; #else - if (cpi->refresh_alt_ref_frame) - cm->frame_context_idx = ARF_FRAME; + if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME; #endif else if (cpi->rc.is_src_frame_alt_ref) cm->frame_context_idx = OVERLAY_FRAME; diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c index 624b10cbc..4e0fa2c5c 100644 --- a/vp10/encoder/rdopt.c +++ b/vp10/encoder/rdopt.c @@ -1320,16 +1320,19 @@ static int64_t txfm_yrd(VP10_COMP *cpi, MACROBLOCK *x, int *r, int64_t *d, int64_t rd = INT64_MAX; vpx_prob skip_prob = vp10_get_skip_prob(cm, xd); int s0, s1; - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - const int tx_select = cm->tx_mode == TX_MODE_SELECT; const int is_inter = is_inter_block(mbmi); + const int tx_size_ctx = get_tx_size_context(xd); + const int tx_size_cat = + is_inter ? 
inter_tx_size_cat_lookup[bs] : intra_tx_size_cat_lookup[bs]; + const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; + const int tx_select = cm->tx_mode == TX_MODE_SELECT; const int r_tx_size = - cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)][tx_size]; -#if CONFIG_EXT_TX - int ext_tx_set; -#endif // CONFIG_EXT_TX + cpi->tx_size_cost[tx_size_cat][tx_size_ctx][coded_tx_size]; assert(skip_prob > 0); +#if CONFIG_EXT_TX && CONFIG_RECT_TX + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX s0 = vp10_cost_bit(skip_prob, 0); s1 = vp10_cost_bit(skip_prob, 1); @@ -1340,20 +1343,19 @@ static int64_t txfm_yrd(VP10_COMP *cpi, MACROBLOCK *x, int *r, int64_t *d, cpi->sf.use_fast_coef_costing); if (*r == INT_MAX) return INT64_MAX; #if CONFIG_EXT_TX - ext_tx_set = get_ext_tx_set(tx_size, bs, is_inter); if (get_ext_tx_types(tx_size, bs, is_inter) > 1 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) { + const int ext_tx_set = get_ext_tx_set(tx_size, bs, is_inter); if (is_inter) { if (ext_tx_set > 0) - *r += - cpi->inter_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->tx_type]; + *r += cpi->inter_tx_type_costs + [ext_tx_set][txsize_sqr_map[mbmi->tx_size]][mbmi->tx_type]; } else { if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX) *r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->mode] [mbmi->tx_type]; } } - #else if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !FIXED_TX_TYPE) { @@ -1400,19 +1402,29 @@ static int64_t choose_tx_size_fix_type(VP10_COMP *cpi, BLOCK_SIZE bs, int start_tx, end_tx; int64_t best_rd = INT64_MAX, last_rd = INT64_MAX; const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - TX_SIZE best_tx = max_tx_size; + TX_SIZE best_tx_size = max_tx_size; const int tx_select = cm->tx_mode == TX_MODE_SELECT; const int is_inter = is_inter_block(mbmi); #if CONFIG_EXT_TX +#if CONFIG_RECT_TX + int evaulate_rect_tx = 0; +#endif // CONFIG_RECT_TX int ext_tx_set; #endif // 
CONFIG_EXT_TX if (tx_select) { +#if CONFIG_EXT_TX && CONFIG_RECT_TX + evaulate_rect_tx = is_rect_tx_allowed(mbmi); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX start_tx = max_tx_size; end_tx = 0; } else { const TX_SIZE chosen_tx_size = - VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]); + tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); +#if CONFIG_EXT_TX && CONFIG_RECT_TX + evaulate_rect_tx = is_rect_tx(chosen_tx_size); + assert(IMPLIES(evaulate_rect_tx, is_rect_tx_allowed(mbmi))); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX start_tx = chosen_tx_size; end_tx = chosen_tx_size; } @@ -1423,8 +1435,29 @@ static int64_t choose_tx_size_fix_type(VP10_COMP *cpi, BLOCK_SIZE bs, *psse = INT64_MAX; mbmi->tx_type = tx_type; + +#if CONFIG_EXT_TX && CONFIG_RECT_TX + if (evaulate_rect_tx) { + const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs]; + const int ext_tx_set = get_ext_tx_set(rect_tx_size, bs, 1); + if (ext_tx_used_inter[ext_tx_set][tx_type]) { + rd = txfm_yrd(cpi, x, &r, &d, &s, &sse, ref_best_rd, bs, tx_type, + rect_tx_size); + best_tx_size = rect_tx_size; + best_rd = rd; + *distortion = d; + *rate = r; + *skip = s; + *psse = sse; + } + } +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + last_rd = INT64_MAX; for (n = start_tx; n >= end_tx; --n) { +#if CONFIG_EXT_TX && CONFIG_RECT_TX + if (is_rect_tx(n)) break; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, n)) continue; if (!is_inter && x->use_default_intra_tx_type && tx_type != get_default_tx_type(0, xd, 0, n)) @@ -1463,7 +1496,7 @@ static int64_t choose_tx_size_fix_type(VP10_COMP *cpi, BLOCK_SIZE bs, last_rd = rd; if (rd < best_rd) { - best_tx = n; + best_tx_size = n; best_rd = rd; *distortion = d; *rate = r; @@ -1471,7 +1504,7 @@ static int64_t choose_tx_size_fix_type(VP10_COMP *cpi, BLOCK_SIZE bs, *psse = sse; } } - mbmi->tx_size = best_tx; + mbmi->tx_size = best_tx_size; return best_rd; } @@ -1488,9 +1521,7 @@ static int64_t 
estimate_yrd_for_sb(VP10_COMP *cpi, BLOCK_SIZE bs, MACROBLOCK *x, static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *sse, int64_t ref_best_rd, BLOCK_SIZE bs) { - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP10_COMMON *const cm = &cpi->common; - const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; TX_TYPE tx_type, best_tx_type = DCT_DCT; @@ -1505,7 +1536,7 @@ static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x, int *rate, int ext_tx_set; #endif // CONFIG_EXT_TX - mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size); + mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); #if CONFIG_EXT_TX ext_tx_set = get_ext_tx_set(mbmi->tx_size, bs, is_inter); @@ -4225,7 +4256,7 @@ static int64_t encode_inter_mb_segment(VP10_COMP *cpi, MACROBLOCK *x, tx_size == max_txsize_rect_lookup[mi->mbmi.sb_type])); #else assert(tx_size == TX_4X4); -#endif +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX assert(tx_type == DCT_DCT); vp10_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col); @@ -4750,7 +4781,7 @@ static int64_t rd_pick_best_sub8x8_mode( xd->lossless[mbmi->segment_id] ? TX_4X4 : max_txsize_rect_lookup[bsize]; #else mbmi->tx_size = TX_4X4; -#endif +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX vp10_zero(*bsi); @@ -10522,7 +10553,7 @@ void vp10_rd_pick_inter_mode_sub8x8(struct VP10_COMP *cpi, *mbmi = best_mbmode; #if CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX mbmi->inter_tx_size[0][0] = mbmi->tx_size; -#endif +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX x->skip |= best_skip2; if (!is_inter_block(&best_mbmode)) {