From: Geza Lore
Date: Wed, 4 Nov 2015 14:56:34 +0000 (+0000)
Subject: Flip the result of the inverse transform for FLIPADST.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4f5108090a6047d5d4d9ce1df302da23b2ef4bc5;p=libvpx

Flip the result of the inverse transform for FLIPADST.

When using FLIPADST, the vp10_inv_txfm_add functions used to flip the
destination array, add the result of the inverse transform to it, and
then flip the destination back. This has been replaced by flipping the
result of the inverse transform before adding it to the destination.

Up-down flipping is done by negating the destination stride and
starting from the bottom, so it should now be free. Left-right flipping
is done with the usual SSE2 instructions in the optimized code. The C
functions match the SSE2 functions as expected, so the C functions now
do the flipping as well when required. Standalone sketches of both
flipping tricks follow the diff below.

Adding this cleanly required some refactoring of the C functions, but
there is no measurable performance impact when ext-tx is not enabled.
Encode speedup with ext-tx enabled is about 3%.

Change-Id: I5b04e5d720f0b9f0d54fd8607a8764f2314c7234
---

diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index e91a2deaf..3b806dded 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -182,39 +182,6 @@ void idst16_c(const tran_low_t *input, tran_low_t *output) {
   output[15] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
 }
 
-static void fliplr(uint8_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l; ++i) {
-    for (j = 0; j < l / 2; ++j) {
-      const uint8_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + l - 1 - j];
-      dest[i * stride + l - 1 - j] = tmp;
-    }
-  }
-}
-
-static void flipud(uint8_t *dest, int stride, int l) {
-  int i, j;
-  for (j = 0; j < l; ++j) {
-    for (i = 0; i < l / 2; ++i) {
-      const uint8_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
-      dest[(l - 1 - i) * stride + j] = tmp;
-    }
-  }
-}
-
-static void fliplrud(uint8_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l / 2; ++i) {
-    for (j = 0; j < l; ++j) {
-      const uint8_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
-      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
-    }
-  }
-}
-
 // Inverse identiy transform and add.
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int bs) {
@@ -228,6 +195,52 @@ static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
   }
 }
 
+#define FLIPUD_PTR(dest, stride, size) do {  \
+  (dest) = (dest) + ((size) - 1) * (stride); \
+  (stride) = - (stride);                     \
+} while (0)
+
+static void maybe_flip_strides(uint8_t **dst, int *dstride,
+                               tran_low_t **src, int *sstride,
+                               int tx_type, int size) {
+  // Note that the transpose of src will be added to dst. In order to LR
+  // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+  // the addends, we UD flip the dst.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+      break;
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case FLIPADST_DST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case DST_FLIPADST:
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case FLIPADST_FLIPADST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
 void highbd_idst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
@@ -407,39 +420,6 @@ void highbd_idst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   output[15] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
 }
 
-static void fliplr16(uint16_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l; ++i) {
-    for (j = 0; j < l / 2; ++j) {
-      const uint16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + l - 1 - j];
-      dest[i * stride + l - 1 - j] = tmp;
-    }
-  }
-}
-
-static void flipud16(uint16_t *dest, int stride, int l) {
-  int i, j;
-  for (j = 0; j < l; ++j) {
-    for (i = 0; i < l / 2; ++i) {
-      const uint16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
-      dest[(l - 1 - i) * stride + j] = tmp;
-    }
-  }
-}
-
-static void fliplrud16(uint16_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l / 2; ++i) {
-    for (j = 0; j < l; ++j) {
-      const uint16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
-      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
-    }
-  }
-}
-
 static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bs, int bd) {
   int r, c;
@@ -480,25 +460,41 @@ void vp10_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
   };
 
   int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
+  tran_low_t tmp;
+  tran_low_t out[4][4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
 
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    IHT_4[tx_type].rows(input, outptr);
+    IHT_4[tx_type].rows(input, out[i]);
     input += 4;
-    outptr += 4;
+  }
+
+  // transpose
+  for (i = 1 ; i < 4; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
   }
 
   // inverse transform column vectors
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    IHT_4[tx_type].cols(temp_in, temp_out);
+    IHT_4[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
     }
   }
 }
@@ -527,26 +523,41 @@ void vp10_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
   };
 
   int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  const transform_2d ht = IHT_8[tx_type];
+  tran_low_t tmp;
+  tran_low_t out[8][8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
 
   // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr);
-    input += 8;
-    outptr += 8;
+    IHT_8[tx_type].rows(input, out[i]);
+    input += 8;
+  }
+
+  // transpose
+  for (i = 1 ; i < 8; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
   }
 
   // inverse transform column vectors
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out);
+    IHT_8[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
     }
   }
 }
@@ -573,27 +584,43 @@ void vp10_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
     { idst16_c, idst16_c },  // DST_DST = 15
 #endif  // CONFIG_EXT_TX
   };
+
   int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-  const transform_2d ht = IHT_16[tx_type];
+  tran_low_t tmp;
+  tran_low_t out[16][16];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 16;
+
+  // inverse transform row vectors
+  for (i = 0; i < 16; ++i) {
+    IHT_16[tx_type].rows(input, out[i]);
+    input += 16;
+  }
+
+  // transpose
+  for (i = 1 ; i < 16; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
 
-  // Rows
+  // inverse transform column vectors
   for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr);
-    input += 16;
-    outptr += 16;
+    IHT_16[tx_type].cols(out[i], out[i]);
   }
 
-  // Columns
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
     }
   }
 }
@@ -663,68 +690,43 @@ void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
   if (lossless) {
     assert(tx_type == DCT_DCT);
     vp10_iwht4x4_add(input, dest, stride, eob);
-  } else {
-    switch (tx_type) {
-      case DCT_DCT:
-        vp10_idct4x4_add(input, dest, stride, eob);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-        vp10_iht4x4_16_add(input, dest, stride, tx_type);
-        break;
+    return;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_idct4x4_add(input, dest, stride, eob);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_iht4x4_16_add(input, dest, stride, tx_type);
+      break;
 #if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      flipud(dest, stride, 4);
-      vp10_iht4x4_16_add(input, dest, stride, ADST_DCT);
-      flipud(dest, stride, 4);
-      break;
     case DCT_FLIPADST:
-      fliplr(dest, stride, 4);
-      vp10_iht4x4_16_add(input, dest, stride, DCT_ADST);
-      fliplr(dest, stride, 4);
-      break;
     case FLIPADST_FLIPADST:
-      fliplrud(dest, stride, 4);
-      vp10_iht4x4_16_add(input, dest, stride, ADST_ADST);
-      fliplrud(dest, stride, 4);
-      break;
     case ADST_FLIPADST:
-      fliplr(dest, stride, 4);
-      vp10_iht4x4_16_add(input, dest, stride, ADST_ADST);
-      fliplr(dest, stride, 4);
-      break;
     case FLIPADST_ADST:
-      flipud(dest, stride, 4);
-      vp10_iht4x4_16_add(input, dest, stride, ADST_ADST);
-      flipud(dest, stride, 4);
+      vp10_iht4x4_16_add(input, dest, stride, tx_type);
       break;
     case DST_DST:
    case DST_DCT:
    case DCT_DST:
    case DST_ADST:
    case ADST_DST:
-      // Use C version since DST only exists in C code
-      vp10_iht4x4_16_add_c(input, dest, stride, tx_type);
-      break;
     case FLIPADST_DST:
-      flipud(dest, stride, 4);
-      vp10_iht4x4_16_add_c(input, dest, stride, ADST_DST);
-      flipud(dest, stride, 4);
-      break;
    case DST_FLIPADST:
-      fliplr(dest, stride, 4);
-      vp10_iht4x4_16_add_c(input, dest, stride, DST_ADST);
-      fliplr(dest, stride, 4);
+      // Use C version since DST only exists in C code
+      vp10_iht4x4_16_add_c(input, dest, stride, tx_type);
       break;
     case IDTX:
      inv_idtx_add_c(input, dest, stride, 4);
      break;
 #endif  // CONFIG_EXT_TX
-      default:
-        assert(0);
-        break;
-    }
+    default:
+      assert(0);
+      break;
   }
 }
@@ -741,47 +743,21 @@ void vp10_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
       break;
 #if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      flipud(dest, stride, 8);
-      vp10_iht8x8_64_add(input, dest, stride, ADST_DCT);
-      flipud(dest, stride, 8);
-      break;
     case DCT_FLIPADST:
-      fliplr(dest, stride, 8);
-      vp10_iht8x8_64_add(input, dest, stride, DCT_ADST);
-      fliplr(dest, stride, 8);
-      break;
     case FLIPADST_FLIPADST:
-      fliplrud(dest, stride, 8);
-      vp10_iht8x8_64_add(input, dest, stride, ADST_ADST);
-      fliplrud(dest, stride, 8);
-      break;
    case ADST_FLIPADST:
-      fliplr(dest, stride, 8);
-      vp10_iht8x8_64_add(input, dest, stride, ADST_ADST);
-      fliplr(dest, stride, 8);
-      break;
    case FLIPADST_ADST:
-      flipud(dest, stride, 8);
-      vp10_iht8x8_64_add(input, dest, stride, ADST_ADST);
-      flipud(dest, stride, 8);
+      vp10_iht8x8_64_add(input, dest, stride, tx_type);
       break;
    case DST_DST:
    case DST_DCT:
    case DCT_DST:
    case DST_ADST:
    case ADST_DST:
-      // Use C version since DST only exists in C code
-      vp10_iht8x8_64_add_c(input, dest, stride, tx_type);
-      break;
    case FLIPADST_DST:
-      flipud(dest, stride, 8);
-      vp10_iht8x8_64_add_c(input, dest, stride, ADST_DST);
-      flipud(dest, stride, 8);
-      break;
    case DST_FLIPADST:
-      fliplr(dest, stride, 8);
-      vp10_iht8x8_64_add_c(input, dest, stride, DST_ADST);
-      fliplr(dest, stride, 8);
+      // Use C version since DST only exists in C code
+      vp10_iht8x8_64_add_c(input, dest, stride, tx_type);
       break;
    case IDTX:
      inv_idtx_add_c(input, dest, stride, 8);
@@ -806,47 +782,21 @@ void vp10_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
       break;
 #if CONFIG_EXT_TX
    case FLIPADST_DCT:
-      flipud(dest, stride, 16);
-      vp10_iht16x16_256_add(input, dest, stride, ADST_DCT);
-      flipud(dest, stride, 16);
-      break;
    case DCT_FLIPADST:
-      fliplr(dest, stride, 16);
-      vp10_iht16x16_256_add(input, dest, stride, DCT_ADST);
-      fliplr(dest, stride, 16);
-      break;
    case FLIPADST_FLIPADST:
-      fliplrud(dest, stride, 16);
-      vp10_iht16x16_256_add(input, dest, stride, ADST_ADST);
-      fliplrud(dest, stride, 16);
-      break;
    case ADST_FLIPADST:
-      fliplr(dest, stride, 16);
-      vp10_iht16x16_256_add(input, dest, stride, ADST_ADST);
-      fliplr(dest, stride, 16);
-      break;
    case FLIPADST_ADST:
-      flipud(dest, stride, 16);
-      vp10_iht16x16_256_add(input, dest, stride, ADST_ADST);
-      flipud(dest, stride, 16);
+      vp10_iht16x16_256_add(input, dest, stride, tx_type);
       break;
    case DST_DST:
    case DST_DCT:
    case DCT_DST:
    case DST_ADST:
    case ADST_DST:
-      // Use C version since DST only exists in C code
-      vp10_iht16x16_256_add_c(input, dest, stride, tx_type);
-      break;
    case FLIPADST_DST:
-      flipud(dest, stride, 16);
-      vp10_iht16x16_256_add_c(input, dest, stride, ADST_DST);
-      flipud(dest, stride, 16);
-      break;
    case DST_FLIPADST:
-      fliplr(dest, stride, 16);
-      vp10_iht16x16_256_add_c(input, dest, stride, DST_ADST);
-      fliplr(dest, stride, 16);
+      // Use C version since DST only exists in C code
+      vp10_iht16x16_256_add_c(input, dest, stride, tx_type);
       break;
    case IDTX:
      inv_idtx_add_c(input, dest, stride, 16);
@@ -907,25 +857,43 @@ void vp10_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
+  tran_low_t tmp;
+  tran_low_t out[4][4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
 
-  // Inverse transform row vectors.
+  // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    HIGH_IHT_4[tx_type].rows(input, outptr, bd);
+    HIGH_IHT_4[tx_type].rows(input, out[i], bd);
     input += 4;
-    outptr += 4;
   }
 
-  // Inverse transform column vectors.
+  // transpose
+  for (i = 1 ; i < 4; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 4; ++i) {
+    HIGH_IHT_4[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides((uint8_t**)&dest, &stride,
+                     &outp, &outstride, tx_type, 4 * 2);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    HIGH_IHT_4[tx_type].cols(temp_in, temp_out, bd);
     for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 4), bd);
     }
   }
 }
@@ -953,28 +921,46 @@ void vp10_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
 #endif  // CONFIG_EXT_TX
   };
 
-  int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // Inverse transform row vectors.
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[8][8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
+
+  // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr, bd);
-    input += 8;
-    outptr += 8;
+    HIGH_IHT_8[tx_type].rows(input, out[i], bd);
+    input += 8;
   }
 
-  // Inverse transform column vectors.
+  // transpose
+  for (i = 1 ; i < 8; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 8; ++i) {
+    HIGH_IHT_8[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides((uint8_t**)&dest,
+                     &stride, &outp, &outstride, tx_type, 8 * 2);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 5), bd);
     }
   }
 }
@@ -1002,28 +988,46 @@ void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
 #endif  // CONFIG_EXT_TX
   };
 
-  int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-  const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // Rows
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[16][16];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 16;
+
+  // inverse transform row vectors
+  for (i = 0; i < 16; ++i) {
+    HIGH_IHT_16[tx_type].rows(input, out[i], bd);
+    input += 16;
+  }
+
+  // transpose
+  for (i = 1 ; i < 16; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
   for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr, bd);
-    input += 16;
-    outptr += 16;
+    HIGH_IHT_16[tx_type].cols(out[i], out[i], bd);
   }
 
-  // Columns
+#if CONFIG_EXT_TX
+  maybe_flip_strides((uint8_t**)&dest, &stride,
+                     &outp, &outstride, tx_type, 16 * 2);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 6), bd);
    }
   }
 }
@@ -1097,68 +1101,43 @@ void vp10_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
   if (lossless) {
     assert(tx_type == DCT_DCT);
     vp10_highbd_iwht4x4_add(input, dest, stride, eob, bd);
-  } else {
-    switch (tx_type) {
-      case DCT_DCT:
-        vp10_highbd_idct4x4_add(input, dest, stride, eob, bd);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-        vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
-        break;
+    return;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_highbd_idct4x4_add(input, dest, stride, eob, bd);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
+      break;
 #if CONFIG_EXT_TX
    case FLIPADST_DCT:
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      vp10_highbd_iht4x4_16_add(input, dest, stride, ADST_DCT, bd);
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      break;
    case DCT_FLIPADST:
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      vp10_highbd_iht4x4_16_add(input, dest, stride, DCT_ADST, bd);
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      break;
    case FLIPADST_FLIPADST:
-      fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      vp10_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
-      fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      break;
    case ADST_FLIPADST:
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      vp10_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      break;
    case FLIPADST_ADST:
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      vp10_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
+      vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
       break;
    case DST_DST:
    case DST_DCT:
    case DCT_DST:
    case DST_ADST:
    case ADST_DST:
-      // Use C version since DST only exists in C code
-      vp10_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
-      break;
    case FLIPADST_DST:
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      vp10_highbd_iht4x4_16_add_c(input, dest, stride, ADST_DST, bd);
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      break;
    case DST_FLIPADST:
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-      vp10_highbd_iht4x4_16_add_c(input, dest, stride, DST_ADST, bd);
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
+      // Use C version since DST only exists in C code
+      vp10_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
      break;
    case IDTX:
      highbd_inv_idtx_add_c(input, dest, stride, 4, bd);
      break;
 #endif  // CONFIG_EXT_TX
-      default:
-        assert(0);
-        break;
-    }
+    default:
+      assert(0);
+      break;
   }
 }
@@ -1176,47 +1155,21 @@ void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
       break;
 #if CONFIG_EXT_TX
    case FLIPADST_DCT:
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      vp10_highbd_iht8x8_64_add(input, dest, stride, ADST_DCT, bd);
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      break;
    case DCT_FLIPADST:
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      vp10_highbd_iht8x8_64_add(input, dest, stride, DCT_ADST, bd);
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      break;
    case FLIPADST_FLIPADST:
-      fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      vp10_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
-      fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      break;
    case ADST_FLIPADST:
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      vp10_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      break;
    case FLIPADST_ADST:
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      vp10_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
+      vp10_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
      break;
    case DST_DST:
    case DST_DCT:
    case DCT_DST:
    case DST_ADST:
    case ADST_DST:
-      // Use C version since DST only exists in C code
-      vp10_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
-      break;
    case FLIPADST_DST:
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      vp10_highbd_iht8x8_64_add_c(input, dest, stride, ADST_DST, bd);
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      break;
    case DST_FLIPADST:
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-      vp10_highbd_iht8x8_64_add_c(input, dest, stride, DST_ADST, bd);
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
+      // Use C version since DST only exists in C code
+      vp10_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
      break;
    case IDTX:
      highbd_inv_idtx_add_c(input, dest, stride, 8, bd);
@@ -1242,47 +1195,21 @@ void vp10_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
       break;
 #if CONFIG_EXT_TX
    case FLIPADST_DCT:
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      vp10_highbd_iht16x16_256_add(input, dest, stride, ADST_DCT, bd);
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      break;
    case DCT_FLIPADST:
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      vp10_highbd_iht16x16_256_add(input, dest, stride, DCT_ADST, bd);
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      break;
    case FLIPADST_FLIPADST:
-      fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      vp10_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
-      fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      break;
    case ADST_FLIPADST:
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      vp10_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      break;
    case FLIPADST_ADST:
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      vp10_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
+      vp10_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
      break;
    case DST_DST:
    case DST_DCT:
    case DCT_DST:
    case DST_ADST:
    case ADST_DST:
-      // Use C version since DST only exists in C code
-      vp10_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
-      break;
    case FLIPADST_DST:
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      vp10_highbd_iht16x16_256_add_c(input, dest, stride, ADST_DST, bd);
-      flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      break;
    case DST_FLIPADST:
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-      vp10_highbd_iht16x16_256_add_c(input, dest, stride, DST_ADST, bd);
-      fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
+      // Use C version since DST only exists in C code
+      vp10_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
      break;
    case IDTX:
      highbd_inv_idtx_add_c(input, dest, stride, 16, bd);
diff --git a/vp10/common/x86/idct_intrin_sse2.c b/vp10/common/x86/idct_intrin_sse2.c
index a2c674b80..900f09188 100644
--- a/vp10/common/x86/idct_intrin_sse2.c
+++ b/vp10/common/x86/idct_intrin_sse2.c
@@ -11,6 +11,54 @@
 
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
+#include "vp10/common/enums.h"
+
+#if CONFIG_EXT_TX
+// Reverse the 8 16 bit words in __m128i
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+  return _mm_shuffle_epi32(b, 0x4e);
+}
+
+static INLINE void fliplr_4x4(__m128i in[2]) {
+  in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+  in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
+  in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+  in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
+}
+
+static INLINE void fliplr_8x8(__m128i in[8]) {
+  in[0] = mm_reverse_epi16(in[0]);
+  in[1] = mm_reverse_epi16(in[1]);
+  in[2] = mm_reverse_epi16(in[2]);
+  in[3] = mm_reverse_epi16(in[3]);
+
+  in[4] = mm_reverse_epi16(in[4]);
+  in[5] = mm_reverse_epi16(in[5]);
+  in[6] = mm_reverse_epi16(in[6]);
+  in[7] = mm_reverse_epi16(in[7]);
+}
+
+static INLINE void fliplr_16x8(__m128i in[16]) {
+  fliplr_8x8(&in[0]);
+  fliplr_8x8(&in[8]);
+}
+
+#define FLIPLR_16x16(in0, in1) do {          \
+  __m128i *tmp;                              \
+  fliplr_16x8(in0);                          \
+  fliplr_16x8(in1);                          \
+  tmp = (in0);                               \
+  (in0) = (in1);                             \
+  (in1) = tmp;                               \
+} while (0)
+
+#define FLIPUD_PTR(dest, stride, size) do {  \
+  (dest) = (dest) + ((size) - 1) * (stride); \
+  (stride) = - (stride);                     \
+} while (0)
+#endif
 
 void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                              int tx_type) {
@@ -22,22 +70,50 @@ void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
   in[1] = load_input_data(input + 8);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
+      idct4_sse2(in);
+      idct4_sse2(in);
+      break;
+    case ADST_DCT:
       idct4_sse2(in);
+      iadst4_sse2(in);
+      break;
+    case DCT_ADST:
+      iadst4_sse2(in);
       idct4_sse2(in);
       break;
-    case 1:  // ADST_DCT
+    case ADST_ADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
       idct4_sse2(in);
       iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
       break;
-    case 2:  // DCT_ADST
+    case DCT_FLIPADST:
       iadst4_sse2(in);
       idct4_sse2(in);
+      fliplr_4x4(in);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      fliplr_4x4(in);
       break;
-    case 3:  // ADST_ADST
+    case ADST_FLIPADST:
       iadst4_sse2(in);
       iadst4_sse2(in);
+      fliplr_4x4(in);
       break;
+    case FLIPADST_ADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -52,12 +128,12 @@ void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
 
   // Reconstruction and Store
   {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+    __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
+    __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+    d0 = _mm_unpacklo_epi32(d0, d1);
+    d2 = _mm_unpacklo_epi32(d2, d3);
     d0 = _mm_unpacklo_epi8(d0, zero);
     d2 = _mm_unpacklo_epi8(d2, zero);
     d0 = _mm_add_epi16(d0, in[0]);
@@ -94,22 +170,50 @@ void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
   in[7] = load_input_data(input + 8 * 7);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
+      idct8_sse2(in);
+      idct8_sse2(in);
+      break;
+    case ADST_DCT:
       idct8_sse2(in);
+      iadst8_sse2(in);
+      break;
+    case DCT_ADST:
+      iadst8_sse2(in);
       idct8_sse2(in);
       break;
-    case 1:  // ADST_DCT
+    case ADST_ADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
       idct8_sse2(in);
       iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
       break;
-    case 2:  // DCT_ADST
+    case DCT_FLIPADST:
       iadst8_sse2(in);
       idct8_sse2(in);
+      fliplr_8x8(in);
       break;
-    case 3:  // ADST_ADST
+    case FLIPADST_FLIPADST:
       iadst8_sse2(in);
       iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      fliplr_8x8(in);
       break;
+    case ADST_FLIPADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      fliplr_8x8(in);
+      break;
+    case FLIPADST_ADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -146,29 +250,59 @@ void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
 
 void vp10_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride, int tx_type) {
-  __m128i in0[16], in1[16];
+  __m128i in[32];
+  __m128i *in0 = &in[0];
+  __m128i *in1 = &in[16];
 
   load_buffer_8x16(input, in0);
   input += 8;
   load_buffer_8x16(input, in1);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
+      idct16_sse2(in0, in1);
       idct16_sse2(in0, in1);
+      break;
+    case ADST_DCT:
       idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      break;
+    case DCT_ADST:
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      break;
+    case ADST_ADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
       break;
-    case 1:  // ADST_DCT
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
       idct16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
       break;
-    case 2:  // DCT_ADST
+    case DCT_FLIPADST:
       iadst16_sse2(in0, in1);
       idct16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case ADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
       break;
-    case 3:  // ADST_ADST
+    case FLIPADST_ADST:
       iadst16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
       break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
      break;
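
The up-down flip is exactly the FLIPUD_PTR macro from the patch: point dest at
the first pixel of its last row and negate the stride, and an unmodified
row-major add loop then walks the block bottom-up. The following is a minimal
standalone sketch of that trick, not libvpx code: add_residual and the 4x4
test block are illustrative inventions, while FLIPUD_PTR is copied verbatim
from the patch.

#include <stdio.h>

#define FLIPUD_PTR(dest, stride, size) do {  \
  (dest) = (dest) + ((size) - 1) * (stride); \
  (stride) = - (stride);                     \
} while (0)

/* Row-major add of a size x size residual block into dest. Nothing here
 * knows about flipping; it simply also works when stride is negative,
 * which is what makes the UD flip free. */
static void add_residual(const int *res, unsigned char *dest,
                         int stride, int size) {
  int i, j;
  for (i = 0; i < size; ++i)
    for (j = 0; j < size; ++j)
      dest[i * stride + j] =
          (unsigned char)(dest[i * stride + j] + res[i * size + j]);
}

int main(void) {
  unsigned char dst[4][4] = { { 0 } };
  unsigned char *d = &dst[0][0];
  int res[16];
  int stride = 4, i, j, k;

  for (k = 0; k < 16; ++k) res[k] = k;  /* residual rows: 0..3, 4..7, ... */

  FLIPUD_PTR(d, stride, 4);  /* d now points at the last row, stride = -4 */
  add_residual(res, d, stride, 4);

  /* The residual lands vertically mirrored: the first printed row of dst
   * is 12 13 14 15, the last is 0 1 2 3. */
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) printf("%3d", dst[i][j]);
    printf("\n");
  }
  return 0;
}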
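The left-right flip in the SSE2 path rests on the three-shuffle sequence in
mm_reverse_epi16: _mm_shufflelo_epi16 and _mm_shufflehi_epi16 with immediate
0x1b each reverse one 4-word half of the register, and _mm_shuffle_epi32 with
0x4e swaps the two 64-bit halves, leaving all 8 words reversed. A standalone
check of that sequence follows; the test harness is illustrative, the
intrinsic sequence is the one the patch adds.

#include <emmintrin.h>
#include <stdio.h>

/* Same shuffles as the patch: reverse the 8 16-bit words in an __m128i. */
static __m128i mm_reverse_epi16(const __m128i x) {
  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);  /* words 0-3 -> 3,2,1,0 */
  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);  /* words 4-7 -> 7,6,5,4 */
  return _mm_shuffle_epi32(b, 0x4e);               /* swap 64-bit halves */
}

int main(void) {
  short v[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  short r[8];
  int i;

  _mm_storeu_si128((__m128i *)r,
                   mm_reverse_epi16(_mm_loadu_si128((const __m128i *)v)));
  for (i = 0; i < 8; ++i) printf("%d ", r[i]);  /* prints: 7 6 5 4 3 2 1 0 */
  printf("\n");
  return 0;
}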