From 85ab9d56cc8c8e00cb2d5bb6fc8283ab40a30fe0 Mon Sep 17 00:00:00 2001
From: Geza Lore
Date: Fri, 13 Nov 2015 15:16:28 +0000
Subject: [PATCH] Flip the result of the inv transform for FLIPADST.

This is a port of 4f5108090a6047d5d4d9ce1df302da23b2ef4bc5

This commit also fixes a bug where the FLIPADST transforms, when
combined with a DST (that is, FLIPADST_DST and DST_FLIPADST), did not
actually perform a flipped transform but a straight ADST instead. This
was because the C implementation they fell back on did not implement
flipping. This is now fixed as well, and FLIPADST_DST and DST_FLIPADST
do what they are supposed to do.

There are 3 functions in the SR_MODE experiment that should also be
updated, but since the SR_MODE build is broken at the upstream tip of
nextgen, I could not test them, so I have put in assertions and FIXME
notes at the problematic places.

Change-Id: I5b8175b85f944f2369b183a26256e08d97f4bdef
---
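Reviewer note (not part of the commit): the flipping mechanics may be
easier to see in isolation. The sketch below is hypothetical demo code;
it is not in the tree and the names are made up. It shows the
FLIPUD_PTR trick this patch relies on: rather than flipping pixels
after the fact, point the destination pointer at its last row and
negate the stride, so the unchanged add-loop writes its rows in
reverse order.

  /* flipud_demo.c : standalone sketch, not part of this patch */
  #include <stdio.h>

  #define FLIPUD_PTR(dest, stride, size) do { \
    (dest) = (dest) + ((size) - 1) * (stride); \
    (stride) = - (stride); \
  } while (0)

  int main(void) {
    unsigned char buf[4][4] = { { 0 } };
    unsigned char *dst = &buf[0][0];
    int stride = 4, r, c;

    FLIPUD_PTR(dst, stride, 4);  /* dst -> last row, stride = -4 */

    /* The usual "row r at dst[r * stride + c]" addressing now
     * stores row r into buf[3 - r], i.e. upside down. */
    for (r = 0; r < 4; ++r)
      for (c = 0; c < 4; ++c)
        dst[r * stride + c] = (unsigned char)(10 * r + c);

    for (r = 0; r < 4; ++r)  /* prints row 30..33 first, 0..3 last */
      printf("%d %d %d %d\n", buf[r][0], buf[r][1], buf[r][2], buf[r][3]);
    return 0;
  }

The LR cases fall out of the same trick: since it is the transpose of
the source that gets added to the destination, UD-flipping the
transposed source (via *src/*sstride) is an LR flip in destination
coordinates. maybe_flip_strides() below composes these two moves per
tx_type, and the SSE2 path does the equivalent with the fliplr_*()
helpers and FLIPLR_16x16 on the registers plus FLIPUD_PTR on the
destination.
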
 vp9/common/vp9_idct.c                 | 596 ++++++++++++--------
 vp9/common/x86/vp9_idct_intrin_sse2.c | 171 +++++++-
 2 files changed, 423 insertions(+), 344 deletions(-)

diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 81106446d..a9431f4da 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -16,6 +16,59 @@
 #include "vp9/common/vp9_idct.h"
 
 #if CONFIG_EXT_TX
+#define FLIPUD_PTR(dest, stride, size) do { \
+  (dest) = (dest) + ((size) - 1) * (stride); \
+  (stride) = - (stride); \
+} while (0)
+
+static void maybe_flip_strides(uint8_t **dst, int *dstride,
+                               tran_low_t **src, int *sstride,
+                               int tx_type, int size) {
+  // Note that the transpose of src will be added to dst. In order to LR
+  // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+  // the addends, we UD flip the dst.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      break;
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case FLIPADST_FLIPADST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+      break;
+    case DST_FLIPADST:
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case FLIPADST_DST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
 void idst4(const tran_low_t *input, tran_low_t *output) {
   // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
   static const int32_t sinvalue_lookup[] = {
@@ -635,25 +688,41 @@ void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
   };
   int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
+  tran_low_t tmp;
+  tran_low_t out[4][4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
 
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    IHT_4[tx_type].rows(input, outptr);
+    IHT_4[tx_type].rows(input, out[i]);
     input += 4;
-    outptr += 4;
+  }
+
+  // transpose
+  for (i = 1 ; i < 4; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
   }
 
   // inverse transform column vectors
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    IHT_4[tx_type].cols(temp_in, temp_out);
+    IHT_4[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
     }
   }
 }
@@ -756,97 +825,44 @@ static const transform_2d IHT_8[] = {
 #endif  // CONFIG_EXT_TX
 };
 
-#if CONFIG_EXT_TX
-void fliplr(uint8_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l; ++i) {
-    for (j = 0; j < l / 2; ++j) {
-      const uint8_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + l - 1 - j];
-      dest[i * stride + l - 1 - j] = tmp;
-    }
-  }
-}
-
-void flipud(uint8_t *dest, int stride, int l) {
-  int i, j;
-  for (j = 0; j < l; ++j) {
-    for (i = 0; i < l / 2; ++i) {
-      const uint8_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
-      dest[(l - 1 - i) * stride + j] = tmp;
-    }
-  }
-}
-
-void fliplrud(uint8_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l / 2; ++i) {
-    for (j = 0; j < l; ++j) {
-      const uint8_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
-      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
-    }
-  }
-}
-
-void fliplr16(uint16_t *dest, int stride, int l) {
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
   int i, j;
-  for (i = 0; i < l; ++i) {
-    for (j = 0; j < l / 2; ++j) {
-      const uint16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + l - 1 - j];
-      dest[i * stride + l - 1 - j] = tmp;
-    }
-  }
-}
+  tran_low_t tmp;
+  tran_low_t out[8][8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
 
-void flipud16(uint16_t *dest, int stride, int l) {
-  int i, j;
-  for (j = 0; j < l; ++j) {
-    for (i = 0; i < l / 2; ++i) {
-      const uint16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
-      dest[(l - 1 - i) * stride + j] = tmp;
-    }
+  // inverse transform row vectors
+  for (i = 0; i < 8; ++i) {
+    IHT_8[tx_type].rows(input, out[i]);
+    input += 8;
   }
-}
 
-void fliplrud16(uint16_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l / 2; ++i) {
-    for (j = 0; j < l; ++j) {
-      const uint16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
-      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
+  // transpose
+  for (i = 1 ; i < 8; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
     }
   }
-}
-#endif  // CONFIG_EXT_TX
-
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                         int tx_type) {
-  int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  const transform_2d ht = IHT_8[tx_type];
 
-  // inverse transform row vectors
+  // inverse transform column vectors
   for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr);
-    input += 8;
-    outptr += 8;
+    IHT_8[tx_type].cols(out[i], out[i]);
  }
 
-  // inverse transform column vectors
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out);
     for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
     }
   }
 }
@@ -1291,26 +1307,41 @@ static const transform_2d IHT_16[] = {
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-  const transform_2d ht = IHT_16[tx_type];
+  tran_low_t tmp;
+  tran_low_t out[16][16];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 16;
 
-  // Rows
+  // inverse transform row vectors
   for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr);
-    input += 16;
-    outptr += 16;
+    IHT_16[tx_type].rows(input, out[i]);
+    input += 16;
   }
 
-  // Columns
+  // transpose
+  for (i = 1 ; i < 16; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 16; ++i) {
+    IHT_16[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
     }
   }
 }
@@ -1911,26 +1942,6 @@ void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 4);
-    vp9_iht4x4_16_add(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 4);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 4);
-    vp9_iht4x4_16_add(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 4);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 4);
-    vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 4);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 4);
-    vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 4);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 4);
-    vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 4);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht4x4_16_add(input, dest, stride, tx_type);
@@ -1944,26 +1955,6 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 8);
-    vp9_iht8x8_64_add(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 8);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 8);
-    vp9_iht8x8_64_add(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 8);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 8);
-    vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 8);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 8);
-    vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 8);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 8);
-    vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 8);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht8x8_64_add(input, dest, stride, tx_type);
@@ -1977,26 +1968,6 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht16x16_256_add_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 16);
-    vp9_iht16x16_256_add(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 16);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 16);
-    vp9_iht16x16_256_add(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 16);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 16);
-    vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 16);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 16);
-    vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 16);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 16);
-    vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 16);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht16x16_256_add(input, dest, stride, tx_type);
@@ -2775,7 +2746,7 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
 
 void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
-  const highbd_transform_2d IHT_4[] = {
+  const highbd_transform_2d HIGH_IHT_4[] = {
     { vp9_highbd_idct4, vp9_highbd_idct4 },    // DCT_DCT = 0
     { highbd_iadst4, vp9_highbd_idct4 },       // ADST_DCT = 1
     { vp9_highbd_idct4, highbd_iadst4 },       // DCT_ADST = 2
@@ -2798,25 +2769,43 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
+  tran_low_t tmp;
+  tran_low_t out[4][4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
 
-  // Inverse transform row vectors.
+  // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    IHT_4[tx_type].rows(input, outptr, bd);
+    HIGH_IHT_4[tx_type].rows(input, out[i], bd);
     input += 4;
-    outptr += 4;
   }
 
-  // Inverse transform column vectors.
+  // transpose
+  for (i = 1 ; i < 4; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 4; ++i) {
+    HIGH_IHT_4[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides((uint8_t**)&dest,
+                     &stride, &outp, &outstride, tx_type, 4 * 2);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    IHT_4[tx_type].cols(temp_in, temp_out, bd);
     for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 4), bd);
     }
   }
 }
@@ -2921,28 +2910,46 @@ static const highbd_transform_2d HIGH_IHT_8[] = {
 
 void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
-  int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // Inverse transform row vectors.
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[8][8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
+
+  // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr, bd);
-    input += 8;
-    outptr += 8;
+    HIGH_IHT_8[tx_type].rows(input, out[i], bd);
+    input += 8;
   }
 
-  // Inverse transform column vectors.
+  // transpose
+  for (i = 1 ; i < 8; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 8; ++i) {
+    HIGH_IHT_8[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides((uint8_t**)&dest,
+                     &stride, &outp, &outstride, tx_type, 8 * 2);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out, bd);
     for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 5), bd);
     }
   }
 }
@@ -3361,28 +3368,46 @@ static const highbd_transform_2d HIGH_IHT_16[] = {
 
 void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int tx_type, int bd) {
-  int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-  const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // Rows
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[16][16];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 16;
+
+  // inverse transform row vectors
   for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr, bd);
-    input += 16;
-    outptr += 16;
+    HIGH_IHT_16[tx_type].rows(input, out[i], bd);
+    input += 16;
   }
 
-  // Columns
+  // transpose
+  for (i = 1 ; i < 16; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 16; ++i) {
+    HIGH_IHT_16[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides((uint8_t**)&dest, &stride,
+                     &outp, &outstride, tx_type, 16 * 2);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out, bd);
     for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 6), bd);
     }
   }
 }
@@ -3954,26 +3979,6 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-    vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_DCT, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-    vp9_highbd_iht4x4_16_add(input, dest, stride, DCT_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-    vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-    vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-    vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
@@ -3987,26 +3992,6 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-    vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_DCT, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-    vp9_highbd_iht8x8_64_add(input, dest, stride, DCT_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-    vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-    vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-    vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
@@ -4020,26 +4005,6 @@ void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-    vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_DCT, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-    vp9_highbd_iht16x16_256_add(input, dest, stride, DCT_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-    vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-    vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-    vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
@@ -4276,6 +4241,19 @@ void vp9_iht4x4_16_c(const tran_low_t *input, int16_t *dest, int stride,
   tran_low_t *outptr = out;
   tran_low_t temp_in[4], temp_out[4];
 
+  // FIXME: If the SR_MODE experiment is resurrected, then this function must
+  // be fixed to handle the FLIPADST cases by actually flipping its output
+  // See the other vp9_iht*add_c functions
+#if CONFIG_EXT_TX
+  assert(tx_type != FLIPADST_DCT);
+  assert(tx_type != DCT_FLIPADST);
+  assert(tx_type != FLIPADST_FLIPADST);
+  assert(tx_type != ADST_FLIPADST);
+  assert(tx_type != FLIPADST_ADST);
+  assert(tx_type != DST_FLIPADST);
+  assert(tx_type != FLIPADST_DST);
+#endif  // CONFIG_EXT_TX
+
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
     IHT_4[tx_type].rows(input, outptr);
@@ -4302,6 +4280,19 @@ void vp9_iht8x8_64_c(const tran_low_t *input, int16_t *dest, int stride,
   tran_low_t temp_in[8], temp_out[8];
   const transform_2d ht = IHT_8[tx_type];
 
+  // FIXME: If the SR_MODE experiment is resurrected, then this function must
+  // be fixed to handle the FLIPADST cases by actually flipping its output
+  // See the other vp9_iht*add_c functions
+#if CONFIG_EXT_TX
+  assert(tx_type != FLIPADST_DCT);
+  assert(tx_type != DCT_FLIPADST);
+  assert(tx_type != FLIPADST_FLIPADST);
+  assert(tx_type != ADST_FLIPADST);
+  assert(tx_type != FLIPADST_ADST);
+  assert(tx_type != DST_FLIPADST);
+  assert(tx_type != FLIPADST_DST);
+#endif  // CONFIG_EXT_TX
+
   // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
     ht.rows(input, outptr);
@@ -4378,6 +4369,19 @@ void vp9_iht16x16_256_c(const tran_low_t *input, int16_t *dest, int stride,
   tran_low_t temp_in[16], temp_out[16];
   const transform_2d ht = IHT_16[tx_type];
 
+  // FIXME: If the SR_MODE experiment is resurrected, then this function must
+  // be fixed to handle the FLIPADST cases by actually flipping its output
+  // See the other vp9_iht*add_c functions
+#if CONFIG_EXT_TX
+  assert(tx_type != FLIPADST_DCT);
+  assert(tx_type != DCT_FLIPADST);
+  assert(tx_type != FLIPADST_FLIPADST);
+  assert(tx_type != ADST_FLIPADST);
+  assert(tx_type != FLIPADST_ADST);
+  assert(tx_type != DST_FLIPADST);
+  assert(tx_type != FLIPADST_DST);
+#endif  // CONFIG_EXT_TX
+
   // Rows
   for (i = 0; i < 16; ++i) {
     ht.rows(input, outptr);
@@ -4582,26 +4586,6 @@ void vp9_iht4x4(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht4x4_16_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 4);
-    vp9_iht4x4_16(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 4);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 4);
-    vp9_iht4x4_16(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 4);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 4);
-    vp9_iht4x4_16(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 4);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 4);
-    vp9_iht4x4_16(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 4);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 4);
-    vp9_iht4x4_16(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 4);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht4x4_16(input, dest, stride, tx_type);
@@ -4615,26 +4599,6 @@ void vp9_iht8x8(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht8x8_64_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 8);
-    vp9_iht8x8_64(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 8);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 8);
-    vp9_iht8x8_64(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 8);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 8);
-    vp9_iht8x8_64(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 8);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 8);
-    vp9_iht8x8_64(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 8);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 8);
-    vp9_iht8x8_64(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 8);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht8x8_64(input, dest, stride, tx_type);
@@ -4648,26 +4612,6 @@ void vp9_iht16x16(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht16x16_256_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 16);
-    vp9_iht16x16_256(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 16);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 16);
-    vp9_iht16x16_256(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 16);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 16);
-    vp9_iht16x16_256(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 16);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 16);
-    vp9_iht16x16_256(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 16);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 16);
-    vp9_iht16x16_256(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 16);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht16x16_256(input, dest, stride, tx_type);
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 45fd95b81..5270f5903 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -11,6 +11,55 @@
 #include "vp9/common/x86/vp9_idct_intrin_sse2.h"
 #include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_enums.h"
+
+#if CONFIG_EXT_TX
+// Reverse the 8 16 bit words in __m128i
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+  return _mm_shuffle_epi32(b, 0x4e);
+}
+
+static INLINE void fliplr_4x4(__m128i in[2]) {
+  in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+  in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
+  in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+  in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
+}
+
+static INLINE void fliplr_8x8(__m128i in[8]) {
+  in[0] = mm_reverse_epi16(in[0]);
+  in[1] = mm_reverse_epi16(in[1]);
+  in[2] = mm_reverse_epi16(in[2]);
+  in[3] = mm_reverse_epi16(in[3]);
+
+  in[4] = mm_reverse_epi16(in[4]);
+  in[5] = mm_reverse_epi16(in[5]);
+  in[6] = mm_reverse_epi16(in[6]);
+  in[7] = mm_reverse_epi16(in[7]);
+}
+
+static INLINE void fliplr_16x8(__m128i in[16]) {
+  fliplr_8x8(&in[0]);
+  fliplr_8x8(&in[8]);
+}
+
+#define FLIPLR_16x16(in0, in1) do { \
+  __m128i *tmp; \
+  fliplr_16x8(in0); \
+  fliplr_16x8(in1); \
+  tmp = (in0); \
+  (in0) = (in1); \
+  (in1) = tmp; \
+} while (0)
+
+#define FLIPUD_PTR(dest, stride, size) do { \
+  (dest) = (dest) + ((size) - 1) * (stride); \
+  (stride) = - (stride); \
+} while (0)
+#endif
 
 #define RECON_AND_STORE4X4(dest, in_x) \
 { \
   __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
@@ -126,12 +175,12 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
 
   // Reconstruction and Store
   {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+    __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *) (dest + stride)));
-    d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
-        *(const int *) (dest + stride * 3)), d2);
+    __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+    d0 = _mm_unpacklo_epi32(d0, d1);
+    d2 = _mm_unpacklo_epi32(d3, d2);
     d0 = _mm_unpacklo_epi8(d0, zero);
     d2 = _mm_unpacklo_epi8(d2, zero);
     d0 = _mm_add_epi16(d0, input2);
@@ -271,22 +320,50 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
   in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
 
   switch (tx_type) {
-    case 0: // DCT_DCT
+    case DCT_DCT:
       idct4_sse2(in);
       idct4_sse2(in);
       break;
-    case 1: // ADST_DCT
+    case ADST_DCT:
      idct4_sse2(in);
       iadst4_sse2(in);
       break;
-    case 2: // DCT_ADST
+    case DCT_ADST:
       iadst4_sse2(in);
       idct4_sse2(in);
       break;
-    case 3: // ADST_ADST
+    case ADST_ADST:
       iadst4_sse2(in);
       iadst4_sse2(in);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      break;
+    case DCT_FLIPADST:
+      iadst4_sse2(in);
+      idct4_sse2(in);
+      fliplr_4x4(in);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      fliplr_4x4(in);
+      break;
+    case ADST_FLIPADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      fliplr_4x4(in);
+      break;
+    case FLIPADST_ADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -875,22 +952,50 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
   in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
 
   switch (tx_type) {
-    case 0: // DCT_DCT
+    case DCT_DCT:
+      idct8_sse2(in);
       idct8_sse2(in);
+      break;
+    case ADST_DCT:
+      idct8_sse2(in);
+      iadst8_sse2(in);
+      break;
+    case DCT_ADST:
+      iadst8_sse2(in);
       idct8_sse2(in);
       break;
-    case 1: // ADST_DCT
+    case ADST_ADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
       idct8_sse2(in);
       iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
       break;
-    case 2: // DCT_ADST
+    case DCT_FLIPADST:
       iadst8_sse2(in);
       idct8_sse2(in);
+      fliplr_8x8(in);
       break;
-    case 3: // ADST_ADST
+    case FLIPADST_FLIPADST:
       iadst8_sse2(in);
       iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      fliplr_8x8(in);
       break;
+    case ADST_FLIPADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      fliplr_8x8(in);
+      break;
+    case FLIPADST_ADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -2331,29 +2436,59 @@ static void iadst16_sse2(__m128i *in0, __m128i *in1) {
 
 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                                int tx_type) {
-  __m128i in0[16], in1[16];
+  __m128i in[32];
+  __m128i *in0 = &in[0];
+  __m128i *in1 = &in[16];
 
   load_buffer_8x16(input, in0);
   input += 8;
   load_buffer_8x16(input, in1);
 
   switch (tx_type) {
-    case 0: // DCT_DCT
+    case DCT_DCT:
       idct16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 1: // ADST_DCT
+    case ADST_DCT:
       idct16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
       break;
-    case 2: // DCT_ADST
+    case DCT_ADST:
       iadst16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 3: // ADST_ADST
+    case ADST_ADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+    case DCT_FLIPADST:
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case ADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_ADST:
       iadst16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
       break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
-- 
2.49.0