Eliminate copying for FLIPADST in fwd transforms.

author Geza Lore <gezalore@gmail.com>
Tue, 3 Nov 2015 11:10:20 +0000 (11:10 +0000)
committer Geza Lore <gezalore@gmail.com>
Tue, 3 Nov 2015 17:10:55 +0000 (17:10 +0000)
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c

index b388981e4387ab0c4b299ae45bb1be8451e92b67..c77e1430b934f104f9bdc34e59f6bb5158513297 100644 (file)
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -1161,6 +1161,106 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
    output[15] = (tran_low_t)-x1;
  }
  
+#if CONFIG_EXT_TX
+static void copy_block(const int16_t *src, int src_stride, int l,
+                       int16_t *dest, int dest_stride) {
+  int i;
+  for (i = 0; i < l; ++i) {
+    memcpy(dest + dest_stride * i, src + src_stride * i,
+           l * sizeof(int16_t));
+  }
+}
+
+static void fliplr(int16_t *dest, int stride, int l) {
+  int i, j;
+  for (i = 0; i < l; ++i) {
+    for (j = 0; j < l / 2; ++j) {
+      const int16_t tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[i * stride + l - 1 - j];
+      dest[i * stride + l - 1 - j] = tmp;
+    }
+  }
+}
+
+static void flipud(int16_t *dest, int stride, int l) {
+  int i, j;
+  for (j = 0; j < l; ++j) {
+    for (i = 0; i < l / 2; ++i) {
+      const int16_t tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
+      dest[(l - 1 - i) * stride + j] = tmp;
+    }
+  }
+}
+
+static void fliplrud(int16_t *dest, int stride, int l) {
+  int i, j;
+  for (i = 0; i < l / 2; ++i) {
+    for (j = 0; j < l; ++j) {
+      const int16_t tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
+      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
+    }
+  }
+}
+
+static void copy_fliplr(const int16_t *src, int src_stride, int l,
+                          int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);
+  fliplr(dest, dest_stride, l);
+}
+
+static void copy_flipud(const int16_t *src, int src_stride, int l,
+                          int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);
+  flipud(dest, dest_stride, l);
+}
+
+static void copy_fliplrud(const int16_t *src, int src_stride, int l,
+                            int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);
+  fliplrud(dest, dest_stride, l);
+}
+
+static void maybe_flip_input(const int16_t **src, int *src_stride, int l,
+                             int16_t *buff, int tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+      break;
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case FLIPADST_DST:
+      copy_flipud(*src, *src_stride, l, buff, l);
+      *src = buff;
+      *src_stride = l;
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case DST_FLIPADST:
+      copy_fliplr(*src, *src_stride, l, buff, l);
+      *src = buff;
+      *src_stride = l;
+      break;
+    case FLIPADST_FLIPADST:
+      copy_fliplrud(*src, *src_stride, l, buff, l);
+      *src = buff;
+      *src_stride = l;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+#endif  // CONFIG_EXT_TX
+
  static const transform_2d FHT_4[] = {
    { fdct4,  fdct4  },  // DCT_DCT           = 0,
    { fadst4, fdct4  },  // ADST_DCT          = 1,
@@ -1234,6 +1334,11 @@ void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
      tran_low_t temp_in[4], temp_out[4];
      const transform_2d ht = FHT_4[tx_type];
  
+#if CONFIG_EXT_TX
+    int16_t flipped_input[4 * 4];
+    maybe_flip_input(&input, &stride, 4, flipped_input, tx_type);
+#endif
+
      // Columns
      for (i = 0; i < 4; ++i) {
        for (j = 0; j < 4; ++j)
@@ -1378,6 +1483,11 @@ void vp10_fht8x8_c(const int16_t *input, tran_low_t *output,
      tran_low_t temp_in[8], temp_out[8];
      const transform_2d ht = FHT_8[tx_type];
  
+#if CONFIG_EXT_TX
+    int16_t flipped_input[8 * 8];
+    maybe_flip_input(&input, &stride, 8, flipped_input, tx_type);
+#endif
+
      // Columns
      for (i = 0; i < 8; ++i) {
        for (j = 0; j < 8; ++j)
@@ -1464,6 +1574,11 @@ void vp10_fht16x16_c(const int16_t *input, tran_low_t *output,
      tran_low_t temp_in[16], temp_out[16];
      const transform_2d ht = FHT_16[tx_type];
  
+#if CONFIG_EXT_TX
+    int16_t flipped_input[16 * 16];
+    maybe_flip_input(&input, &stride, 16, flipped_input, tx_type);
+#endif
+
      // Columns
      for (i = 0; i < 16; ++i) {
        for (j = 0; j < 16; ++j)
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c

index 5f6e3b2afcb783f68a5c298a6ef308cff51ed646..00e37801a09d0d82a855a0a1ab4f813e6b2290e4 100644 (file)
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -326,66 +326,6 @@ static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
  #endif  // CONFIG_VP9_HIGHBITDEPTH
  
  #if CONFIG_EXT_TX
-static void copy_block(const int16_t *src, int src_stride, int l,
-                       int16_t *dest, int dest_stride) {
-  int i;
-  for (i = 0; i < l; ++i) {
-    memcpy(dest + dest_stride * i, src + src_stride * i,
-           l * sizeof(int16_t));
-  }
-}
-
-static void fliplr(int16_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l; ++i) {
-    for (j = 0; j < l / 2; ++j) {
-      const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + l - 1 - j];
-      dest[i * stride + l - 1 - j] = tmp;
-    }
-  }
-}
-
-static void flipud(int16_t *dest, int stride, int l) {
-  int i, j;
-  for (j = 0; j < l; ++j) {
-    for (i = 0; i < l / 2; ++i) {
-      const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
-      dest[(l - 1 - i) * stride + j] = tmp;
-    }
-  }
-}
-
-static void fliplrud(int16_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l / 2; ++i) {
-    for (j = 0; j < l; ++j) {
-      const int16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
-      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
-    }
-  }
-}
-
-static void copy_fliplr(const int16_t *src, int src_stride, int l,
-                          int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, dest, dest_stride);
-  fliplr(dest, dest_stride, l);
-}
-
-static void copy_flipud(const int16_t *src, int src_stride, int l,
-                          int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, dest, dest_stride);
-  flipud(dest, dest_stride, l);
-}
-
-static void copy_fliplrud(const int16_t *src, int src_stride, int l,
-                            int16_t *dest, int dest_stride) {
-  copy_block(src, src_stride, l, dest, dest_stride);
-  fliplrud(dest, dest_stride, l);
-}
-
  // Forward identity transform.
  static void fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
                         int bs) {
@@ -404,15 +344,13 @@ static void fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
  void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                         int diff_stride, TX_TYPE tx_type, int lossless) {
    if (lossless) {
+    assert(tx_type == DCT_DCT);
      vp10_fwht4x4(src_diff, coeff, diff_stride);
-  } else {
-#if CONFIG_EXT_TX
-  int16_t src_diff2[16];
-#endif  // CONFIG_EXT_TX
+    return;
+  }
+
    switch (tx_type) {
      case DCT_DCT:
-      vpx_fdct4x4(src_diff, coeff, diff_stride);
-      break;
      case ADST_DCT:
      case DCT_ADST:
      case ADST_ADST:
@@ -420,40 +358,21 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
        break;
  #if CONFIG_EXT_TX
      case FLIPADST_DCT:
-      copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_fht4x4(src_diff2, coeff, 4, ADST_DCT);
-      break;
      case DCT_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_fht4x4(src_diff2, coeff, 4, DCT_ADST);
-      break;
      case FLIPADST_FLIPADST:
-      copy_fliplrud(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_fht4x4(src_diff2, coeff, 4, ADST_ADST);
-      break;
      case ADST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_fht4x4(src_diff2, coeff, 4, ADST_ADST);
-      break;
      case FLIPADST_ADST:
-      copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_fht4x4(src_diff2, coeff, 4, ADST_ADST);
+      vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
        break;
      case DST_DST:
      case DCT_DST:
      case DST_DCT:
      case DST_ADST:
      case ADST_DST:
-      // Use C version since DST exists only in C
-      vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
-      break;
      case DST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_fht4x4_c(src_diff2, coeff, 4, DST_ADST);
-      break;
      case FLIPADST_DST:
-      copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_fht4x4_c(src_diff2, coeff, 4, ADST_DST);
+      // Use C version since DST exists only in C
+      vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
        break;
      case IDTX:
        fwd_idtx_c(src_diff, coeff, diff_stride, 4);
@@ -462,15 +381,11 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
      default:
        assert(0);
        break;
-    }
    }
  }
  
  static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                           int diff_stride, TX_TYPE tx_type) {
-#if CONFIG_EXT_TX
-  int16_t src_diff2[64];
-#endif  // CONFIG_EXT_TX
    switch (tx_type) {
      case DCT_DCT:
      case ADST_DCT:
@@ -480,40 +395,21 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
        break;
  #if CONFIG_EXT_TX
      case FLIPADST_DCT:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8(src_diff2, coeff, 8, ADST_DCT);
-      break;
      case DCT_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8(src_diff2, coeff, 8, DCT_ADST);
-      break;
      case FLIPADST_FLIPADST:
-      copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
-      break;
      case ADST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
-      break;
      case FLIPADST_ADST:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
+      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
        break;
      case DST_DST:
      case DCT_DST:
      case DST_DCT:
      case DST_ADST:
      case ADST_DST:
-      // Use C version since DST exists only in C
-      vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
-      break;
      case DST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8_c(src_diff2, coeff, 8, DST_ADST);
-      break;
      case FLIPADST_DST:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8_c(src_diff2, coeff, 8, ADST_DST);
+      // Use C version since DST exists only in C
+      vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
        break;
      case IDTX:
        fwd_idtx_c(src_diff, coeff, diff_stride, 8);
@@ -527,9 +423,6 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
  
  static void fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff,
                             int diff_stride, TX_TYPE tx_type) {
-#if CONFIG_EXT_TX
-  int16_t src_diff2[64];
-#endif  // CONFIG_EXT_TX
    switch (tx_type) {
      case DCT_DCT:
      case ADST_DCT:
@@ -539,56 +432,34 @@ static void fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff,
        break;
  #if CONFIG_EXT_TX
      case FLIPADST_DCT:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8(src_diff2, coeff, 8, ADST_DCT);
-      break;
      case DCT_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8(src_diff2, coeff, 8, DCT_ADST);
-      break;
      case FLIPADST_FLIPADST:
-      copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
-      break;
      case ADST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
-      break;
      case FLIPADST_ADST:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8(src_diff2, coeff, 8, ADST_ADST);
+      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
        break;
      case DST_DST:
      case DCT_DST:
      case DST_DCT:
      case DST_ADST:
      case ADST_DST:
-      // Use C version since DST exists only in C
-      vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
-        break;
      case DST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8_c(src_diff2, coeff, 8, DST_ADST);
-      break;
      case FLIPADST_DST:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_fht8x8_c(src_diff2, coeff, 8, ADST_DST);
+      // Use C version since DST exists only in C
+      vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
        break;
      case IDTX:
        fwd_idtx_c(src_diff, coeff, diff_stride, 8);
        break;
  #endif  // CONFIG_EXT_TX
-      default:
-        assert(0);
-        break;
+    default:
+      assert(0);
+      break;
    }
  }
  
  static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
                             int diff_stride, TX_TYPE tx_type) {
-#if CONFIG_EXT_TX
-  int16_t src_diff2[256];
-#endif  // CONFIG_EXT_TX
    switch (tx_type) {
      case DCT_DCT:
      case ADST_DCT:
@@ -598,40 +469,21 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
        break;
  #if CONFIG_EXT_TX
      case FLIPADST_DCT:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16(src_diff2, coeff, 16, ADST_DCT);
-      break;
      case DCT_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16(src_diff2, coeff, 16, DCT_ADST);
-      break;
      case FLIPADST_FLIPADST:
-      copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
-      break;
      case ADST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
-      break;
      case FLIPADST_ADST:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
+      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
        break;
      case DST_DST:
      case DCT_DST:
      case DST_DCT:
      case DST_ADST:
      case ADST_DST:
-      // Use C version since DST exists only in C
-      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
-      break;
      case DST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16_c(src_diff2, coeff, 16, DST_ADST);
-      break;
      case FLIPADST_DST:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16_c(src_diff2, coeff, 16, ADST_DST);
+      // Use C version since DST exists only in C
+      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
        break;
      case IDTX:
        fwd_idtx_c(src_diff, coeff, diff_stride, 16);
@@ -645,9 +497,6 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
  
  static void fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff,
                               int diff_stride, TX_TYPE tx_type) {
-#if CONFIG_EXT_TX
-  int16_t src_diff2[256];
-#endif  // CONFIG_EXT_TX
    switch (tx_type) {
      case DCT_DCT:
      case ADST_DCT:
@@ -657,40 +506,21 @@ static void fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff,
        break;
  #if CONFIG_EXT_TX
      case FLIPADST_DCT:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16(src_diff2, coeff, 16, ADST_DCT);
-      break;
      case DCT_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16(src_diff2, coeff, 16, DCT_ADST);
-      break;
      case FLIPADST_FLIPADST:
-      copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
-      break;
      case ADST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
-      break;
      case FLIPADST_ADST:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16(src_diff2, coeff, 16, ADST_ADST);
+      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
        break;
      case DST_DST:
      case DCT_DST:
      case DST_DCT:
      case DST_ADST:
      case ADST_DST:
-      // Use C version since DST exists only in C
-      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
-      break;
      case DST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16_c(src_diff2, coeff, 16, DST_ADST);
-      break;
      case FLIPADST_DST:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_fht16x16_c(src_diff2, coeff, 16, ADST_DST);
+      // Use C version since DST exists only in C
+      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
        break;
      case IDTX:
        fwd_idtx_c(src_diff, coeff, diff_stride, 16);
@@ -754,76 +584,48 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
    if (lossless) {
      assert(tx_type == DCT_DCT);
      vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
-  } else {
-#if CONFIG_EXT_TX
-  int16_t src_diff2[16];
-#endif  // CONFIG_EXT_TX
-    switch (tx_type) {
-      case DCT_DCT:
-      vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-        vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
-        break;
+    return;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      break;
  #if CONFIG_EXT_TX
      case FLIPADST_DCT:
-      copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_DCT);
-      break;
      case DCT_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_highbd_fht4x4(src_diff2, coeff, 4, DCT_ADST);
-      break;
      case FLIPADST_FLIPADST:
-      copy_fliplrud(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST);
-      break;
      case ADST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST);
-      break;
      case FLIPADST_ADST:
-      copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST);
+      vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
        break;
      case DST_DST:
      case DCT_DST:
      case DST_DCT:
      case DST_ADST:
      case ADST_DST:
-      // Use C version since DST exists only in C
-      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
-      break;
      case DST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_highbd_fht4x4_c(src_diff2, coeff, 4, DST_ADST);
-      break;
      case FLIPADST_DST:
-      copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
-      vp10_highbd_fht4x4_c(src_diff2, coeff, 4, ADST_DST);
+      // Use C version since DST exists only in C
+      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
        break;
      case IDTX:
        fwd_idtx_c(src_diff, coeff, diff_stride, 4);
        break;
  #endif  // CONFIG_EXT_TX
-      default:
-        assert(0);
-        break;
-    }
+    default:
+      assert(0);
+      break;
    }
  }
  
  static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TX_TYPE tx_type) {
-#if CONFIG_EXT_TX
-  int16_t src_diff2[64];
-#endif  // CONFIG_EXT_TX
    switch (tx_type) {
      case DCT_DCT:
-      vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-      break;
      case ADST_DCT:
      case DCT_ADST:
      case ADST_ADST:
@@ -831,40 +633,21 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
        break;
  #if CONFIG_EXT_TX
      case FLIPADST_DCT:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_DCT);
-      break;
      case DCT_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8(src_diff2, coeff, 8, DCT_ADST);
-      break;
      case FLIPADST_FLIPADST:
-      copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
-      break;
      case ADST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
-      break;
      case FLIPADST_ADST:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
+      vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
        break;
      case DST_DST:
      case DCT_DST:
      case DST_DCT:
      case DST_ADST:
      case ADST_DST:
-      // Use C version since DST exists only in C
-      vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
-      break;
      case DST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8_c(src_diff2, coeff, 8, DST_ADST);
-      break;
      case FLIPADST_DST:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8_c(src_diff2, coeff, 8, ADST_DST);
+      // Use C version since DST exists only in C
+      vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
        break;
      case IDTX:
        fwd_idtx_c(src_diff, coeff, diff_stride, 8);
@@ -878,13 +661,8 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
  
  static void highbd_fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff,
                                    int diff_stride, TX_TYPE tx_type) {
-#if CONFIG_EXT_TX
-  int16_t src_diff2[64];
-#endif  // CONFIG_EXT_TX
    switch (tx_type) {
      case DCT_DCT:
-      vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
-      break;
      case ADST_DCT:
      case DCT_ADST:
      case ADST_ADST:
@@ -892,40 +670,21 @@ static void highbd_fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff,
        break;
  #if CONFIG_EXT_TX
      case FLIPADST_DCT:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_DCT);
-      break;
      case DCT_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8(src_diff2, coeff, 8, DCT_ADST);
-      break;
      case FLIPADST_FLIPADST:
-      copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
-      break;
      case ADST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
-      break;
      case FLIPADST_ADST:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
+      vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
        break;
      case DST_DST:
      case DCT_DST:
      case DST_DCT:
      case DST_ADST:
      case ADST_DST:
-      // Use C version since DST exists only in C
-      vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
-      break;
      case DST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8_c(src_diff2, coeff, 8, DST_ADST);
-      break;
      case FLIPADST_DST:
-      copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
-      vp10_highbd_fht8x8_c(src_diff2, coeff, 8, ADST_DST);
+      // Use C version since DST exists only in C
+      vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
        break;
      case IDTX:
        fwd_idtx_c(src_diff, coeff, diff_stride, 8);
@@ -939,13 +698,8 @@ static void highbd_fwd_txfm_8x8_1(const int16_t *src_diff, tran_low_t *coeff,
  
  static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
                                    int diff_stride, TX_TYPE tx_type) {
-#if CONFIG_EXT_TX
-  int16_t src_diff2[256];
-#endif  // CONFIG_EXT_TX
    switch (tx_type) {
      case DCT_DCT:
-      vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-      break;
      case ADST_DCT:
      case DCT_ADST:
      case ADST_ADST:
@@ -953,40 +707,21 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
        break;
  #if CONFIG_EXT_TX
      case FLIPADST_DCT:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_DCT);
-      break;
      case DCT_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16(src_diff2, coeff, 16, DCT_ADST);
-      break;
      case FLIPADST_FLIPADST:
-      copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
-      break;
      case ADST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
-      break;
      case FLIPADST_ADST:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
+      vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
        break;
      case DST_DST:
      case DCT_DST:
      case DST_DCT:
      case DST_ADST:
      case ADST_DST:
-      // Use C version since DST exists only in C
-      vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
-      break;
      case DST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16_c(src_diff2, coeff, 16, DST_ADST);
-      break;
      case FLIPADST_DST:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16_c(src_diff2, coeff, 16, ADST_DST);
+      // Use C version since DST exists only in C
+      vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
        break;
      case IDTX:
        fwd_idtx_c(src_diff, coeff, diff_stride, 16);
@@ -1000,13 +735,8 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
  
  static void highbd_fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff,
                                      int diff_stride, TX_TYPE tx_type) {
-#if CONFIG_EXT_TX
-  int16_t src_diff2[256];
-#endif  // CONFIG_EXT_TX
    switch (tx_type) {
      case DCT_DCT:
-      vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
-      break;
      case ADST_DCT:
      case DCT_ADST:
      case ADST_ADST:
@@ -1014,40 +744,21 @@ static void highbd_fwd_txfm_16x16_1(const int16_t *src_diff, tran_low_t *coeff,
        break;
  #if CONFIG_EXT_TX
      case FLIPADST_DCT:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_DCT);
-      break;
      case DCT_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16(src_diff2, coeff, 16, DCT_ADST);
-      break;
      case FLIPADST_FLIPADST:
-      copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
-      break;
      case ADST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
-      break;
      case FLIPADST_ADST:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
+      vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
        break;
      case DST_DST:
      case DCT_DST:
      case DST_DCT:
      case DST_ADST:
      case ADST_DST:
-      // Use C version since DST exists only in C
-      vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
-      break;
      case DST_FLIPADST:
-      copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16_c(src_diff2, coeff, 16, DST_ADST);
-      break;
      case FLIPADST_DST:
-      copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
-      vp10_highbd_fht16x16_c(src_diff2, coeff, 16, ADST_DST);
+      // Use C version since DST exists only in C
+      vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
        break;
      case IDTX:
        fwd_idtx_c(src_diff, coeff, diff_stride, 16);
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c

index e1111570a26d1a8afab915c001c41f8b5a7bae78..976fe45fb7b60a26dae8e0d363cdbeb35efd99f5 100644 (file)
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_sse2.c
@@ -18,16 +18,37 @@
  #include "vpx_dsp/x86/txfm_common_sse2.h"
  #include "vpx_ports/mem.h"
  
+// Reverse the 8 16 bit words in __m128i
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+  return _mm_shuffle_epi32(b, 0x4e);
+}
+
  static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
-                                   int stride) {
+                                   int stride, int flipud, int fliplr) {
    const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
    const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
    __m128i mask;
  
-  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
-  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  if (!flipud) {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  } else {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+  }
  
    in[0] = _mm_slli_epi16(in[0], 4);
    in[1] = _mm_slli_epi16(in[1], 4);
@@ -160,23 +181,55 @@ void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
        vpx_fdct4x4_sse2(input, output, stride);
        break;
      case ADST_DCT:
-      load_buffer_4x4(input, in, stride);
+      load_buffer_4x4(input, in, stride, 0, 0);
        fadst4_sse2(in);
        fdct4_sse2(in);
        write_buffer_4x4(output, in);
        break;
      case DCT_ADST:
-      load_buffer_4x4(input, in, stride);
+      load_buffer_4x4(input, in, stride, 0, 0);
        fdct4_sse2(in);
        fadst4_sse2(in);
        write_buffer_4x4(output, in);
        break;
      case ADST_ADST:
-      load_buffer_4x4(input, in, stride);
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      load_buffer_4x4(input, in, stride, 1, 0);
+      fadst4_sse2(in);
+      fdct4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_4x4(input, in, stride, 0, 1);
+      fdct4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_4x4(input, in, stride, 1, 1);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_4x4(input, in, stride, 0, 1);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_4x4(input, in, stride, 1, 0);
        fadst4_sse2(in);
        fadst4_sse2(in);
        write_buffer_4x4(output, in);
        break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -627,15 +680,37 @@ void vp10_fdct8x8_quant_sse2(const int16_t *input, int stride,
  
  // load 8x8 array
  static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
-                                   int stride) {
-  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+                                   int stride, int flipud, int fliplr) {
+  if (!flipud) {
+    in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  } else {
+    in[0]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+    in[1]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    in[2]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    in[3]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    in[4]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    in[5]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    in[6]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    in[7]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = mm_reverse_epi16(in[0]);
+    in[1] = mm_reverse_epi16(in[1]);
+    in[2] = mm_reverse_epi16(in[2]);
+    in[3] = mm_reverse_epi16(in[3]);
+    in[4] = mm_reverse_epi16(in[4]);
+    in[5] = mm_reverse_epi16(in[5]);
+    in[6] = mm_reverse_epi16(in[6]);
+    in[7] = mm_reverse_epi16(in[7]);
+  }
  
    in[0] = _mm_slli_epi16(in[0], 2);
    in[1] = _mm_slli_epi16(in[1], 2);
@@ -1144,26 +1219,63 @@ void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
        vpx_fdct8x8_sse2(input, output, stride);
        break;
      case ADST_DCT:
-      load_buffer_8x8(input, in, stride);
+      load_buffer_8x8(input, in, stride, 0, 0);
        fadst8_sse2(in);
        fdct8_sse2(in);
        right_shift_8x8(in, 1);
        write_buffer_8x8(output, in, 8);
        break;
      case DCT_ADST:
-      load_buffer_8x8(input, in, stride);
+      load_buffer_8x8(input, in, stride, 0, 0);
        fdct8_sse2(in);
        fadst8_sse2(in);
        right_shift_8x8(in, 1);
        write_buffer_8x8(output, in, 8);
        break;
      case ADST_ADST:
-      load_buffer_8x8(input, in, stride);
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fdct8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fdct8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 1);
        fadst8_sse2(in);
        fadst8_sse2(in);
        right_shift_8x8(in, 1);
        write_buffer_8x8(output, in, 8);
        break;
+    case ADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+#endif  // CONFIG_EXT_TX
      default:
        assert(0);
        break;
@@ -1171,15 +1283,37 @@ void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
  }
  
  static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
-                                     __m128i *in1, int stride) {
+                                     __m128i *in1, int stride,
+                                     int flipud, int fliplr) {
+  // Start addresses of the four 8x8 quadrants of the 16x16 input block
+  const int16_t *topL = input;
+  const int16_t *topR = input + 8;
+  const int16_t *botL = input + 8 * stride;
+  const int16_t *botR = input + 8 * stride + 8;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    // Left column: exchange the top and bottom quadrants
+    tmp = topL; topL = botL; botL = tmp;
+    // Right column: exchange the top and bottom quadrants
+    tmp = topR; topR = botR; botR = tmp;
+  }
+
+  if (fliplr) {
+    // Top row: exchange the left and right quadrants
+    tmp = topL; topL = topR; topR = tmp;
+    // Bottom row: exchange the left and right quadrants
+    tmp = botL; botL = botR; botR = tmp;
+  }
+
    // load first 8 columns
-  load_buffer_8x8(input, in0, stride);
-  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+  load_buffer_8x8(topL, in0,     stride, flipud, fliplr);
+  load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
  
-  input += 8;
    // load second 8 columns
-  load_buffer_8x8(input, in1, stride);
-  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+  load_buffer_8x8(topR, in1,     stride, flipud, fliplr);
+  load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
  }
  
  static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
@@ -2031,26 +2165,63 @@ void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
        vpx_fdct16x16_sse2(input, output, stride);
        break;
      case ADST_DCT:
-      load_buffer_16x16(input, in0, in1, stride);
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
        fadst16_sse2(in0, in1);
        right_shift_16x16(in0, in1);
        fdct16_sse2(in0, in1);
        write_buffer_16x16(output, in0, in1, 16);
        break;
      case DCT_ADST:
-      load_buffer_16x16(input, in0, in1, stride);
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
        fdct16_sse2(in0, in1);
        right_shift_16x16(in0, in1);
        fadst16_sse2(in0, in1);
        write_buffer_16x16(output, in0, in1, 16);
        break;
      case ADST_ADST:
-      load_buffer_16x16(input, in0, in1, stride);
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      load_buffer_16x16(input, in0, in1, stride, 1, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 1);
+      fdct16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 1, 1);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 1);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_16x16(input, in0, in1, stride, 1, 0);
        fadst16_sse2(in0, in1);
        right_shift_16x16(in0, in1);
        fadst16_sse2(in0, in1);
        write_buffer_16x16(output, in0, in1, 16);
        break;
+#endif  // CONFIG_EXT_TX
      default:
        assert(0);
        break;
author	Geza Lore <gezalore@gmail.com>
	Tue, 3 Nov 2015 11:10:20 +0000 (11:10 +0000)
committer	Geza Lore <gezalore@gmail.com>
	Tue, 3 Nov 2015 17:10:55 +0000 (17:10 +0000)
vp10/encoder/dct.c		patch \| blob \| history
vp10/encoder/encodemb.c		patch \| blob \| history
vp10/encoder/x86/dct_sse2.c		patch \| blob \| history