From: Geza Lore
Date: Tue, 3 Nov 2015 11:10:20 +0000 (+0000)
Subject: Eliminate copying for FLIPADST in fwd transforms.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=01bb4a318dc0f9069264b7fd5641bc3014f47f32;p=libvpx

Eliminate copying for FLIPADST in fwd transforms.

This patch eliminates the copying of data when using FLIPADST forward
transforms by incorporating the necessary data flipping into the
load_buffer_* functions of the SSE2-optimized forward transforms. The
load_buffer_* functions are normally inlined, so the overhead of
copying the data is removed and the overhead of flipping is minimized.
Left-to-right flipping is still not free, as the columns need to be
shuffled in registers.

To preserve identity between the C and SSE2 implementations, the
appropriate C implementations now also do the data flipping as part of
the transform, rather than relying on the caller to flip the input.

Overall speedup is about 1.5-2% in encode in my tests. Note that this
patch covers only the forward transforms; the inverse transforms will
follow in a later patch.

There are also a few code hygiene changes:
- Fixed the indentation of some switch statements.
- The DCT_DCT transform now always uses the vp10_fht* functions, which
  dispatch to vpx_fdct* for DCT_DCT (some call sites used to call
  vpx_fdct* directly, others used to call vp10_fht*).

Change-Id: I93439257dc5cd104ac6129cfed45af142fb64574
---

diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index b388981e4..c77e1430b 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -1161,6 +1161,106 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
   output[15] = (tran_low_t)-x1;
 }
+#if CONFIG_EXT_TX
+static void copy_block(const int16_t *src, int src_stride, int l,
+                       int16_t *dest, int dest_stride) {
+  int i;
+  for (i = 0; i < l; ++i) {
+    memcpy(dest + dest_stride * i, src + src_stride * i,
+           l * sizeof(int16_t));
+  }
+}
+
+static void fliplr(int16_t *dest, int stride, int l) {
+  int i, j;
+  for (i = 0; i < l; ++i) {
+    for (j = 0; j < l / 2; ++j) {
+      const int16_t tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[i * stride + l - 1 - j];
+      dest[i * stride + l - 1 - j] = tmp;
+    }
+  }
+}
+
+static void flipud(int16_t *dest, int stride, int l) {
+  int i, j;
+  for (j = 0; j < l; ++j) {
+    for (i = 0; i < l / 2; ++i) {
+      const int16_t tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
+      dest[(l - 1 - i) * stride + j] = tmp;
+    }
+  }
+}
+
+static void fliplrud(int16_t *dest, int stride, int l) {
+  int i, j;
+  for (i = 0; i < l / 2; ++i) {
+    for (j = 0; j < l; ++j) {
+      const int16_t tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
+      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
+    }
+  }
+}
+
+static void copy_fliplr(const int16_t *src, int src_stride, int l,
+                        int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);
+  fliplr(dest, dest_stride, l);
+}
+
+static void copy_flipud(const int16_t *src, int src_stride, int l,
+                        int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);
+  flipud(dest, dest_stride, l);
+}
+
+static void copy_fliplrud(const int16_t *src, int src_stride, int l,
+                          int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);
+  fliplrud(dest, dest_stride, l);
+}
+
+static void maybe_flip_input(const int16_t **src, int *src_stride, int l,
+                             int16_t *buff, int tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
case DST_DST: + case DCT_DST: + case DST_DCT: + case DST_ADST: + case ADST_DST: + break; + case FLIPADST_DCT: + case FLIPADST_ADST: + case FLIPADST_DST: + copy_flipud(*src, *src_stride, l, buff, l); + *src = buff; + *src_stride = l; + break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case DST_FLIPADST: + copy_fliplr(*src, *src_stride, l, buff, l); + *src = buff; + *src_stride = l; + break; + case FLIPADST_FLIPADST: + copy_fliplrud(*src, *src_stride, l, buff, l); + *src = buff; + *src_stride = l; + break; + default: + assert(0); + break; + } +} +#endif // CONFIG_EXT_TX + static const transform_2d FHT_4[] = { { fdct4, fdct4 }, // DCT_DCT = 0, { fadst4, fdct4 }, // ADST_DCT = 1, @@ -1234,6 +1334,11 @@ void vp10_fht4x4_c(const int16_t *input, tran_low_t *output, tran_low_t temp_in[4], temp_out[4]; const transform_2d ht = FHT_4[tx_type]; +#if CONFIG_EXT_TX + int16_t flipped_input[4 * 4]; + maybe_flip_input(&input, &stride, 4, flipped_input, tx_type); +#endif + // Columns for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) @@ -1378,6 +1483,11 @@ void vp10_fht8x8_c(const int16_t *input, tran_low_t *output, tran_low_t temp_in[8], temp_out[8]; const transform_2d ht = FHT_8[tx_type]; +#if CONFIG_EXT_TX + int16_t flipped_input[8 * 8]; + maybe_flip_input(&input, &stride, 8, flipped_input, tx_type); +#endif + // Columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) @@ -1464,6 +1574,11 @@ void vp10_fht16x16_c(const int16_t *input, tran_low_t *output, tran_low_t temp_in[16], temp_out[16]; const transform_2d ht = FHT_16[tx_type]; +#if CONFIG_EXT_TX + int16_t flipped_input[16 * 16]; + maybe_flip_input(&input, &stride, 16, flipped_input, tx_type); +#endif + // Columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c index 5f6e3b2af..00e37801a 100644 --- a/vp10/encoder/encodemb.c +++ b/vp10/encoder/encodemb.c @@ -326,66 +326,6 @@ static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src, #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_EXT_TX -static void copy_block(const int16_t *src, int src_stride, int l, - int16_t *dest, int dest_stride) { - int i; - for (i = 0; i < l; ++i) { - memcpy(dest + dest_stride * i, src + src_stride * i, - l * sizeof(int16_t)); - } -} - -static void fliplr(int16_t *dest, int stride, int l) { - int i, j; - for (i = 0; i < l; ++i) { - for (j = 0; j < l / 2; ++j) { - const int16_t tmp = dest[i * stride + j]; - dest[i * stride + j] = dest[i * stride + l - 1 - j]; - dest[i * stride + l - 1 - j] = tmp; - } - } -} - -static void flipud(int16_t *dest, int stride, int l) { - int i, j; - for (j = 0; j < l; ++j) { - for (i = 0; i < l / 2; ++i) { - const int16_t tmp = dest[i * stride + j]; - dest[i * stride + j] = dest[(l - 1 - i) * stride + j]; - dest[(l - 1 - i) * stride + j] = tmp; - } - } -} - -static void fliplrud(int16_t *dest, int stride, int l) { - int i, j; - for (i = 0; i < l / 2; ++i) { - for (j = 0; j < l; ++j) { - const int16_t tmp = dest[i * stride + j]; - dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j]; - dest[(l - 1 - i) * stride + l - 1 - j] = tmp; - } - } -} - -static void copy_fliplr(const int16_t *src, int src_stride, int l, - int16_t *dest, int dest_stride) { - copy_block(src, src_stride, l, dest, dest_stride); - fliplr(dest, dest_stride, l); -} - -static void copy_flipud(const int16_t *src, int src_stride, int l, - int16_t *dest, int dest_stride) { - copy_block(src, src_stride, l, dest, dest_stride); - flipud(dest, dest_stride, l); -} - -static void copy_fliplrud(const 
int16_t *src, int src_stride, int l, - int16_t *dest, int dest_stride) { - copy_block(src, src_stride, l, dest, dest_stride); - fliplrud(dest, dest_stride, l); -} - // Forward identity transform. static void fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, int bs) { @@ -404,15 +344,13 @@ static void fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type, int lossless) { if (lossless) { + assert(tx_type == DCT_DCT); vp10_fwht4x4(src_diff, coeff, diff_stride); - } else { -#if CONFIG_EXT_TX - int16_t src_diff2[16]; -#endif // CONFIG_EXT_TX + return; + } + switch (tx_type) { case DCT_DCT: - vpx_fdct4x4(src_diff, coeff, diff_stride); - break; case ADST_DCT: case DCT_ADST: case ADST_ADST: @@ -420,40 +358,21 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, break; #if CONFIG_EXT_TX case FLIPADST_DCT: - copy_flipud(src_diff, diff_stride, 4, src_diff2, 4); - vp10_fht4x4(src_diff2, coeff, 4, ADST_DCT); - break; case DCT_FLIPADST: - copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4); - vp10_fht4x4(src_diff2, coeff, 4, DCT_ADST); - break; case FLIPADST_FLIPADST: - copy_fliplrud(src_diff, diff_stride, 4, src_diff2, 4); - vp10_fht4x4(src_diff2, coeff, 4, ADST_ADST); - break; case ADST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4); - vp10_fht4x4(src_diff2, coeff, 4, ADST_ADST); - break; case FLIPADST_ADST: - copy_flipud(src_diff, diff_stride, 4, src_diff2, 4); - vp10_fht4x4(src_diff2, coeff, 4, ADST_ADST); + vp10_fht4x4(src_diff, coeff, diff_stride, tx_type); break; case DST_DST: case DCT_DST: case DST_DCT: case DST_ADST: case ADST_DST: - // Use C version since DST exists only in C - vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type); - break; case DST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4); - vp10_fht4x4_c(src_diff2, coeff, 4, DST_ADST); - break; case FLIPADST_DST: - copy_flipud(src_diff, diff_stride, 4, src_diff2, 4); - vp10_fht4x4_c(src_diff2, coeff, 4, ADST_DST); + // Use C version since DST exists only in C + vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type); break; case IDTX: fwd_idtx_c(src_diff, coeff, diff_stride, 4); @@ -462,15 +381,11 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, default: assert(0); break; - } } } static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type) { -#if CONFIG_EXT_TX - int16_t src_diff2[64]; -#endif // CONFIG_EXT_TX switch (tx_type) { case DCT_DCT: case ADST_DCT: @@ -480,40 +395,21 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, break; #if CONFIG_EXT_TX case FLIPADST_DCT: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8(src_diff2, coeff, 8, ADST_DCT); - break; case DCT_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8(src_diff2, coeff, 8, DCT_ADST); - break; case FLIPADST_FLIPADST: - copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST); - break; case ADST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST); - break; case FLIPADST_ADST: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST); + vp10_fht8x8(src_diff, coeff, diff_stride, tx_type); break; case DST_DST: case DCT_DST: case DST_DCT: case DST_ADST: case ADST_DST: - // Use C version since DST exists only in C - 
vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type); - break; case DST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8_c(src_diff2, coeff, 8, DST_ADST); - break; case FLIPADST_DST: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8_c(src_diff2, coeff, 8, ADST_DST); + // Use C version since DST exists only in C + vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type); break; case IDTX: fwd_idtx_c(src_diff, coeff, diff_stride, 8); @@ -527,9 +423,6 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, static void fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type) { -#if CONFIG_EXT_TX - int16_t src_diff2[64]; -#endif // CONFIG_EXT_TX switch (tx_type) { case DCT_DCT: case ADST_DCT: @@ -539,56 +432,34 @@ static void fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff, break; #if CONFIG_EXT_TX case FLIPADST_DCT: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8(src_diff2, coeff, 8, ADST_DCT); - break; case DCT_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8(src_diff2, coeff, 8, DCT_ADST); - break; case FLIPADST_FLIPADST: - copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST); - break; case ADST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST); - break; case FLIPADST_ADST: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST); + vp10_fht8x8(src_diff, coeff, diff_stride, tx_type); break; case DST_DST: case DCT_DST: case DST_DCT: case DST_ADST: case ADST_DST: - // Use C version since DST exists only in C - vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type); - break; case DST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8_c(src_diff2, coeff, 8, DST_ADST); - break; case FLIPADST_DST: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_fht8x8_c(src_diff2, coeff, 8, ADST_DST); + // Use C version since DST exists only in C + vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type); break; case IDTX: fwd_idtx_c(src_diff, coeff, diff_stride, 8); break; #endif // CONFIG_EXT_TX - default: - assert(0); - break; + default: + assert(0); + break; } } static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type) { -#if CONFIG_EXT_TX - int16_t src_diff2[256]; -#endif // CONFIG_EXT_TX switch (tx_type) { case DCT_DCT: case ADST_DCT: @@ -598,40 +469,21 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, break; #if CONFIG_EXT_TX case FLIPADST_DCT: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16(src_diff2, coeff, 16, ADST_DCT); - break; case DCT_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16(src_diff2, coeff, 16, DCT_ADST); - break; case FLIPADST_FLIPADST: - copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST); - break; case ADST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST); - break; case FLIPADST_ADST: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST); + vp10_fht16x16(src_diff, coeff, diff_stride, tx_type); break; case DST_DST: case DCT_DST: case DST_DCT: case DST_ADST: case ADST_DST: - // Use C version since DST exists only in C - vp10_fht16x16_c(src_diff, coeff, 
diff_stride, tx_type); - break; case DST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16_c(src_diff2, coeff, 16, DST_ADST); - break; case FLIPADST_DST: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16_c(src_diff2, coeff, 16, ADST_DST); + // Use C version since DST exists only in C + vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type); break; case IDTX: fwd_idtx_c(src_diff, coeff, diff_stride, 16); @@ -645,9 +497,6 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, static void fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type) { -#if CONFIG_EXT_TX - int16_t src_diff2[256]; -#endif // CONFIG_EXT_TX switch (tx_type) { case DCT_DCT: case ADST_DCT: @@ -657,40 +506,21 @@ static void fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff, break; #if CONFIG_EXT_TX case FLIPADST_DCT: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16(src_diff2, coeff, 16, ADST_DCT); - break; case DCT_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16(src_diff2, coeff, 16, DCT_ADST); - break; case FLIPADST_FLIPADST: - copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST); - break; case ADST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST); - break; case FLIPADST_ADST: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST); + vp10_fht16x16(src_diff, coeff, diff_stride, tx_type); break; case DST_DST: case DCT_DST: case DST_DCT: case DST_ADST: case ADST_DST: - // Use C version since DST exists only in C - vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type); - break; case DST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16_c(src_diff2, coeff, 16, DST_ADST); - break; case FLIPADST_DST: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_fht16x16_c(src_diff2, coeff, 16, ADST_DST); + // Use C version since DST exists only in C + vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type); break; case IDTX: fwd_idtx_c(src_diff, coeff, diff_stride, 16); @@ -754,76 +584,48 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, if (lossless) { assert(tx_type == DCT_DCT); vp10_highbd_fwht4x4(src_diff, coeff, diff_stride); - } else { -#if CONFIG_EXT_TX - int16_t src_diff2[16]; -#endif // CONFIG_EXT_TX - switch (tx_type) { - case DCT_DCT: - vpx_highbd_fdct4x4(src_diff, coeff, diff_stride); - break; - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); - break; + return; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); + break; #if CONFIG_EXT_TX case FLIPADST_DCT: - copy_flipud(src_diff, diff_stride, 4, src_diff2, 4); - vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_DCT); - break; case DCT_FLIPADST: - copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4); - vp10_highbd_fht4x4(src_diff2, coeff, 4, DCT_ADST); - break; case FLIPADST_FLIPADST: - copy_fliplrud(src_diff, diff_stride, 4, src_diff2, 4); - vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST); - break; case ADST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4); - vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST); - break; case FLIPADST_ADST: - copy_flipud(src_diff, diff_stride, 4, 
src_diff2, 4); - vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST); + vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); break; case DST_DST: case DCT_DST: case DST_DCT: case DST_ADST: case ADST_DST: - // Use C version since DST exists only in C - vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type); - break; case DST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4); - vp10_highbd_fht4x4_c(src_diff2, coeff, 4, DST_ADST); - break; case FLIPADST_DST: - copy_flipud(src_diff, diff_stride, 4, src_diff2, 4); - vp10_highbd_fht4x4_c(src_diff2, coeff, 4, ADST_DST); + // Use C version since DST exists only in C + vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type); break; case IDTX: fwd_idtx_c(src_diff, coeff, diff_stride, 4); break; #endif // CONFIG_EXT_TX - default: - assert(0); - break; - } + default: + assert(0); + break; } } static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type) { -#if CONFIG_EXT_TX - int16_t src_diff2[64]; -#endif // CONFIG_EXT_TX switch (tx_type) { case DCT_DCT: - vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - break; case ADST_DCT: case DCT_ADST: case ADST_ADST: @@ -831,40 +633,21 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, break; #if CONFIG_EXT_TX case FLIPADST_DCT: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_DCT); - break; case DCT_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8(src_diff2, coeff, 8, DCT_ADST); - break; case FLIPADST_FLIPADST: - copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST); - break; case ADST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST); - break; case FLIPADST_ADST: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST); + vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); break; case DST_DST: case DCT_DST: case DST_DCT: case DST_ADST: case ADST_DST: - // Use C version since DST exists only in C - vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type); - break; case DST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8_c(src_diff2, coeff, 8, DST_ADST); - break; case FLIPADST_DST: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8_c(src_diff2, coeff, 8, ADST_DST); + // Use C version since DST exists only in C + vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type); break; case IDTX: fwd_idtx_c(src_diff, coeff, diff_stride, 8); @@ -878,13 +661,8 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, static void highbd_fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type) { -#if CONFIG_EXT_TX - int16_t src_diff2[64]; -#endif // CONFIG_EXT_TX switch (tx_type) { case DCT_DCT: - vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride); - break; case ADST_DCT: case DCT_ADST: case ADST_ADST: @@ -892,40 +670,21 @@ static void highbd_fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff, break; #if CONFIG_EXT_TX case FLIPADST_DCT: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_DCT); - break; case DCT_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8(src_diff2, coeff, 8, DCT_ADST); - break; case FLIPADST_FLIPADST: - 
copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST); - break; case ADST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST); - break; case FLIPADST_ADST: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST); + vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); break; case DST_DST: case DCT_DST: case DST_DCT: case DST_ADST: case ADST_DST: - // Use C version since DST exists only in C - vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type); - break; case DST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8_c(src_diff2, coeff, 8, DST_ADST); - break; case FLIPADST_DST: - copy_flipud(src_diff, diff_stride, 8, src_diff2, 8); - vp10_highbd_fht8x8_c(src_diff2, coeff, 8, ADST_DST); + // Use C version since DST exists only in C + vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type); break; case IDTX: fwd_idtx_c(src_diff, coeff, diff_stride, 8); @@ -939,13 +698,8 @@ static void highbd_fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff, static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type) { -#if CONFIG_EXT_TX - int16_t src_diff2[256]; -#endif // CONFIG_EXT_TX switch (tx_type) { case DCT_DCT: - vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - break; case ADST_DCT: case DCT_ADST: case ADST_ADST: @@ -953,40 +707,21 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, break; #if CONFIG_EXT_TX case FLIPADST_DCT: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_DCT); - break; case DCT_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16(src_diff2, coeff, 16, DCT_ADST); - break; case FLIPADST_FLIPADST: - copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST); - break; case ADST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST); - break; case FLIPADST_ADST: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST); + vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); break; case DST_DST: case DCT_DST: case DST_DCT: case DST_ADST: case ADST_DST: - // Use C version since DST exists only in C - vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type); - break; case DST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16_c(src_diff2, coeff, 16, DST_ADST); - break; case FLIPADST_DST: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16_c(src_diff2, coeff, 16, ADST_DST); + // Use C version since DST exists only in C + vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type); break; case IDTX: fwd_idtx_c(src_diff, coeff, diff_stride, 16); @@ -1000,13 +735,8 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, static void highbd_fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type) { -#if CONFIG_EXT_TX - int16_t src_diff2[256]; -#endif // CONFIG_EXT_TX switch (tx_type) { case DCT_DCT: - vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride); - break; case ADST_DCT: case DCT_ADST: case ADST_ADST: @@ -1014,40 +744,21 @@ static void highbd_fwd_txfm_16x16_1(const 
int16_t *src_diff, tran_low_t *coeff, break; #if CONFIG_EXT_TX case FLIPADST_DCT: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_DCT); - break; case DCT_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16(src_diff2, coeff, 16, DCT_ADST); - break; case FLIPADST_FLIPADST: - copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST); - break; case ADST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST); - break; case FLIPADST_ADST: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST); + vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); break; case DST_DST: case DCT_DST: case DST_DCT: case DST_ADST: case ADST_DST: - // Use C version since DST exists only in C - vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type); - break; case DST_FLIPADST: - copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16_c(src_diff2, coeff, 16, DST_ADST); - break; case FLIPADST_DST: - copy_flipud(src_diff, diff_stride, 16, src_diff2, 16); - vp10_highbd_fht16x16_c(src_diff2, coeff, 16, ADST_DST); + // Use C version since DST exists only in C + vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type); break; case IDTX: fwd_idtx_c(src_diff, coeff, diff_stride, 16); diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c index e1111570a..976fe45fb 100644 --- a/vp10/encoder/x86/dct_sse2.c +++ b/vp10/encoder/x86/dct_sse2.c @@ -18,16 +18,37 @@ #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" +// Reverse the 8 16 bit words in __m128i +static INLINE __m128i mm_reverse_epi16(const __m128i x) { + const __m128i a = _mm_shufflelo_epi16(x, 0x1b); + const __m128i b = _mm_shufflehi_epi16(a, 0x1b); + return _mm_shuffle_epi32(b, 0x4e); +} + static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, - int stride) { + int stride, int flipud, int fliplr) { const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); __m128i mask; - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + if (!flipud) { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + } else { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[2] = _mm_shufflelo_epi16(in[2], 0x1b); + in[3] = _mm_shufflelo_epi16(in[3], 0x1b); + } in[0] = _mm_slli_epi16(in[0], 4); in[1] = _mm_slli_epi16(in[1], 4); @@ -160,23 +181,55 @@ void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output, vpx_fdct4x4_sse2(input, output, stride); break; case ADST_DCT: - load_buffer_4x4(input, in, stride); 
+ load_buffer_4x4(input, in, stride, 0, 0); fadst4_sse2(in); fdct4_sse2(in); write_buffer_4x4(output, in); break; case DCT_ADST: - load_buffer_4x4(input, in, stride); + load_buffer_4x4(input, in, stride, 0, 0); fdct4_sse2(in); fadst4_sse2(in); write_buffer_4x4(output, in); break; case ADST_ADST: - load_buffer_4x4(input, in, stride); + load_buffer_4x4(input, in, stride, 0, 0); + fadst4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_4x4(input, in, stride, 1, 0); + fadst4_sse2(in); + fdct4_sse2(in); + write_buffer_4x4(output, in); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in, stride, 0, 1); + fdct4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in, stride, 1, 1); + fadst4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in, stride, 0, 1); + fadst4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in, stride, 1, 0); fadst4_sse2(in); fadst4_sse2(in); write_buffer_4x4(output, in); break; +#endif // CONFIG_EXT_TX default: assert(0); break; @@ -627,15 +680,37 @@ void vp10_fdct8x8_quant_sse2(const int16_t *input, int stride, // load 8x8 array static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, - int stride) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + int stride, int flipud, int fliplr) { + if (!flipud) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + } else { + in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); + } in[0] = _mm_slli_epi16(in[0], 2); in[1] = _mm_slli_epi16(in[1], 2); @@ -1144,26 +1219,63 @@ void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output, vpx_fdct8x8_sse2(input, 
output, stride); break; case ADST_DCT: - load_buffer_8x8(input, in, stride); + load_buffer_8x8(input, in, stride, 0, 0); fadst8_sse2(in); fdct8_sse2(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; case DCT_ADST: - load_buffer_8x8(input, in, stride); + load_buffer_8x8(input, in, stride, 0, 0); fdct8_sse2(in); fadst8_sse2(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; case ADST_ADST: - load_buffer_8x8(input, in, stride); + load_buffer_8x8(input, in, stride, 0, 0); + fadst8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_8x8(input, in, stride, 1, 0); + fadst8_sse2(in); + fdct8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1); + fdct8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 1); fadst8_sse2(in); fadst8_sse2(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; + case ADST_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1); + fadst8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in, stride, 1, 0); + fadst8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; +#endif // CONFIG_EXT_TX default: assert(0); break; @@ -1171,15 +1283,37 @@ void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output, } static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, - __m128i *in1, int stride) { + __m128i *in1, int stride, + int flipud, int fliplr) { + // Load 4 8x8 blocks + const int16_t *topL = input; + const int16_t *topR = input + 8; + const int16_t *botL = input + 8 * stride; + const int16_t *botR = input + 8 * stride + 8; + + const int16_t *tmp; + + if (flipud) { + // Swap left columns + tmp = topL; topL = botL; botL = tmp; + // Swap right columns + tmp = topR; topR = botR; botR = tmp; + } + + if (fliplr) { + // Swap top rows + tmp = topL; topL = topR; topR = tmp; + // Swap bottom rows + tmp = botL; botL = botR; botR = tmp; + } + // load first 8 columns - load_buffer_8x8(input, in0, stride); - load_buffer_8x8(input + 8 * stride, in0 + 8, stride); + load_buffer_8x8(topL, in0, stride, flipud, fliplr); + load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr); - input += 8; // load second 8 columns - load_buffer_8x8(input, in1, stride); - load_buffer_8x8(input + 8 * stride, in1 + 8, stride); + load_buffer_8x8(topR, in1, stride, flipud, fliplr); + load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr); } static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, @@ -2031,26 +2165,63 @@ void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output, vpx_fdct16x16_sse2(input, output, stride); break; case ADST_DCT: - load_buffer_16x16(input, in0, in1, stride); + load_buffer_16x16(input, in0, in1, stride, 0, 0); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fdct16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; case DCT_ADST: - load_buffer_16x16(input, in0, in1, stride); + load_buffer_16x16(input, in0, in1, stride, 0, 0); fdct16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; case ADST_ADST: - load_buffer_16x16(input, in0, in1, stride); + load_buffer_16x16(input, in0, 
in1, stride, 0, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_16x16(input, in0, in1, stride, 1, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DCT_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 0, 1); + fdct16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case FLIPADST_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 1, 1); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case ADST_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 0, 1); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case FLIPADST_ADST: + load_buffer_16x16(input, in0, in1, stride, 1, 0); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; +#endif // CONFIG_EXT_TX default: assert(0); break;
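
For reference, the "columns need to be shuffled in registers" remark refers to
the new mm_reverse_epi16 helper added to vp10/encoder/x86/dct_sse2.c, which
reverses the eight 16-bit lanes of an __m128i with two word shuffles plus a
64-bit half swap. The stand-alone check below is only an illustrative sketch,
not part of the patch; the main() harness, the sample values, and the build
command in the comment are assumptions by the editor.

  /*
   * Sketch: verify that the shuffle sequence used by mm_reverse_epi16
   * reverses the eight 16-bit lanes of an __m128i.
   * Build with e.g.:  gcc -msse2 -o reverse_check reverse_check.c
   */
  #include <assert.h>
  #include <emmintrin.h>
  #include <stdint.h>
  #include <stdio.h>

  static __m128i mm_reverse_epi16(const __m128i x) {
    /* 0x1b selects words 3,2,1,0: reverse the four words of the low half. */
    const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
    /* Same reversal for the four words of the high half. */
    const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
    /* 0x4e swaps the two 64-bit halves, completing the full reversal. */
    return _mm_shuffle_epi32(b, 0x4e);
  }

  int main(void) {
    int16_t src[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
    int16_t dst[8];
    int i;
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    _mm_storeu_si128((__m128i *)dst, mm_reverse_epi16(v));
    for (i = 0; i < 8; ++i) {
      assert(dst[i] == src[7 - i]);  /* lane i now holds the mirrored sample */
    }
    printf("mm_reverse_epi16 OK\n");
    return 0;
  }

The 4x4 path gets away with a single _mm_shufflelo_epi16 per row because each
4-wide row is loaded with _mm_loadl_epi64 and therefore occupies only the low
64 bits of the register.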
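
The 16x16 loader needs no flipping logic of its own: load_buffer_16x16 swaps
the four 8x8 quadrant base pointers according to flipud/fliplr and then reuses
the flipping 8x8 loader on each quadrant. The sketch below demonstrates why
that is equivalent to flipping the whole 16x16 block; all names in it
(flip_direct, flip_by_quadrant, load_8x8_flipped) are illustrative only and
assume nothing beyond the C standard library.

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  #define L 16
  #define Q 8  /* quadrant size */

  /* Reference: flip the whole 16x16 block with direct index arithmetic. */
  static void flip_direct(const int16_t *src, int stride, int16_t *dst,
                          int flipud, int fliplr) {
    int r, c;
    for (r = 0; r < L; ++r)
      for (c = 0; c < L; ++c)
        dst[r * L + c] =
            src[(flipud ? L - 1 - r : r) * stride + (fliplr ? L - 1 - c : c)];
  }

  /* Mimic load_buffer_8x8: copy one 8x8 quadrant, flipped within itself. */
  static void load_8x8_flipped(const int16_t *src, int stride, int16_t *dst,
                               int flipud, int fliplr) {
    int r, c;
    for (r = 0; r < Q; ++r)
      for (c = 0; c < Q; ++c)
        dst[r * L + c] =
            src[(flipud ? Q - 1 - r : r) * stride + (fliplr ? Q - 1 - c : c)];
  }

  /* Mimic load_buffer_16x16: swap quadrant base pointers, then reuse the
   * flipping 8x8 loader for each destination quadrant. */
  static void flip_by_quadrant(const int16_t *src, int stride, int16_t *dst,
                               int flipud, int fliplr) {
    const int16_t *topL = src;
    const int16_t *topR = src + Q;
    const int16_t *botL = src + Q * stride;
    const int16_t *botR = src + Q * stride + Q;
    const int16_t *tmp;

    if (flipud) {  /* swap top and bottom source quadrants */
      tmp = topL; topL = botL; botL = tmp;
      tmp = topR; topR = botR; botR = tmp;
    }
    if (fliplr) {  /* swap left and right source quadrants */
      tmp = topL; topL = topR; topR = tmp;
      tmp = botL; botL = botR; botR = tmp;
    }

    load_8x8_flipped(topL, stride, dst,             flipud, fliplr);
    load_8x8_flipped(topR, stride, dst + Q,         flipud, fliplr);
    load_8x8_flipped(botL, stride, dst + Q * L,     flipud, fliplr);
    load_8x8_flipped(botR, stride, dst + Q * L + Q, flipud, fliplr);
  }

  int main(void) {
    int16_t src[L * L], a[L * L], b[L * L];
    int i, ud, lr;
    for (i = 0; i < L * L; ++i) src[i] = (int16_t)i;
    for (ud = 0; ud <= 1; ++ud) {
      for (lr = 0; lr <= 1; ++lr) {
        flip_direct(src, L, a, ud, lr);
        flip_by_quadrant(src, L, b, ud, lr);
        assert(memcmp(a, b, sizeof(a)) == 0);
      }
    }
    printf("quadrant flipping matches direct flipping\n");
    return 0;
  }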