output[3] = fdct_round_shift(s3);
}
+#if CONFIG_EXT_TX
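+// Copy an l x l block of int16_t samples between strided buffers.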
+static void copy_block(const int16_t *src, int src_stride, int l,
+ int16_t *dest, int dest_stride) {
+ int i;
+ for (i = 0; i < l; ++i) {
+ memcpy(dest + dest_stride * i, src + src_stride * i,
+ l * sizeof(int16_t));
+ }
+}
+
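+// Mirror an l x l block left <-> right, in place.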
+static void fliplr(int16_t *dest, int stride, int l) {
+ int i, j;
+ for (i = 0; i < l; ++i) {
+ for (j = 0; j < l / 2; ++j) {
+ const int16_t tmp = dest[i * stride + j];
+ dest[i * stride + j] = dest[i * stride + l - 1 - j];
+ dest[i * stride + l - 1 - j] = tmp;
+ }
+ }
+}
+
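+// Mirror an l x l block top <-> bottom, in place.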
+static void flipud(int16_t *dest, int stride, int l) {
+ int i, j;
+ for (j = 0; j < l; ++j) {
+ for (i = 0; i < l / 2; ++i) {
+ const int16_t tmp = dest[i * stride + j];
+ dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
+ dest[(l - 1 - i) * stride + j] = tmp;
+ }
+ }
+}
+
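+// Rotate an l x l block by 180 degrees (both mirrors), in place.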
+static void fliplrud(int16_t *dest, int stride, int l) {
+ int i, j;
+ for (i = 0; i < l / 2; ++i) {
+ for (j = 0; j < l; ++j) {
+ const int16_t tmp = dest[i * stride + j];
+ dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
+ dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
+ }
+ }
+}
+
+static void copy_fliplr(const int16_t *src, int src_stride, int l,
+ int16_t *dest, int dest_stride) {
+ copy_block(src, src_stride, l, dest, dest_stride);
+ fliplr(dest, dest_stride, l);
+}
+
+static void copy_flipud(const int16_t *src, int src_stride, int l,
+ int16_t *dest, int dest_stride) {
+ copy_block(src, src_stride, l, dest, dest_stride);
+ flipud(dest, dest_stride, l);
+}
+
+static void copy_fliplrud(const int16_t *src, int src_stride, int l,
+ int16_t *dest, int dest_stride) {
+ copy_block(src, src_stride, l, dest, dest_stride);
+ fliplrud(dest, dest_stride, l);
+}
+
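+// For transform types whose ADST runs in the flipped direction, redirect
+// *src to a flipped copy of the input stored in buff (with stride l); all
+// other transform types pass through unchanged.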
+static void maybe_flip_input(const int16_t **src, int *src_stride, int l,
+ int16_t *buff, int tx_type) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ break;
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ copy_flipud(*src, *src_stride, l, buff, l);
+ *src = buff;
+ *src_stride = l;
+ break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ copy_fliplr(*src, *src_stride, l, buff, l);
+ *src = buff;
+ *src_stride = l;
+ break;
+ case FLIPADST_FLIPADST:
+ copy_fliplrud(*src, *src_stride, l, buff, l);
+ *src = buff;
+ *src_stride = l;
+ break;
+#if CONFIG_DST1
+ case DST_DST:
+ case DCT_DST:
+ case DST_DCT:
+ case DST_ADST:
+ case ADST_DST:
+ break;
+ case FLIPADST_DST:
+ copy_flipud(*src, *src_stride, l, buff, l);
+ *src = buff;
+ *src_stride = l;
+ break;
+ case DST_FLIPADST:
+ copy_fliplr(*src, *src_stride, l, buff, l);
+ *src = buff;
+ *src_stride = l;
+ break;
+#endif // CONFIG_DST1
+ default:
+ assert(0);
+ break;
+ }
+}
+#endif // CONFIG_EXT_TX
+
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
tran_low_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
+#if CONFIG_EXT_TX
+ int16_t flipped_input[4 * 4];
+ maybe_flip_input(&input, &stride, 4, flipped_input, tx_type);
+#endif
+
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
+#if CONFIG_EXT_TX
+ int16_t flipped_input[8 * 8];
+ maybe_flip_input(&input, &stride, 8, flipped_input, tx_type);
+#endif
+
// Columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 16];
+ maybe_flip_input(&input, &stride, 16, flipped_input, tx_type);
+#endif
+
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EXT_TX
-static void copy_block(const int16_t *src, int src_stride, int l,
- int16_t *dest, int dest_stride) {
- int i;
- for (i = 0; i < l; ++i) {
- vpx_memcpy(dest + dest_stride * i, src + src_stride * i,
- l * sizeof(int16_t));
- }
-}
-
-static void fliplr(int16_t *dest, int stride, int l) {
- int i, j;
- for (i = 0; i < l; ++i) {
- for (j = 0; j < l / 2; ++j) {
- const int16_t tmp = dest[i * stride + j];
- dest[i * stride + j] = dest[i * stride + l - 1 - j];
- dest[i * stride + l - 1 - j] = tmp;
- }
- }
-}
-
-static void flipud(int16_t *dest, int stride, int l) {
- int i, j;
- for (j = 0; j < l; ++j) {
- for (i = 0; i < l / 2; ++i) {
- const int16_t tmp = dest[i * stride + j];
- dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
- dest[(l - 1 - i) * stride + j] = tmp;
- }
- }
-}
-
-static void fliplrud(int16_t *dest, int stride, int l) {
- int i, j;
- for (i = 0; i < l / 2; ++i) {
- for (j = 0; j < l; ++j) {
- const int16_t tmp = dest[i * stride + j];
- dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
- dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
- }
- }
-}
-
-static void copy_fliplr(const int16_t *src, int src_stride, int l,
- int16_t *dest, int dest_stride) {
- copy_block(src, src_stride, l, dest, dest_stride);
- fliplr(dest, dest_stride, l);
-}
-
-static void copy_flipud(const int16_t *src, int src_stride, int l,
- int16_t *dest, int dest_stride) {
- copy_block(src, src_stride, l, dest, dest_stride);
- flipud(dest, dest_stride, l);
-}
-
-static void copy_fliplrud(const int16_t *src, int src_stride, int l,
- int16_t *dest, int dest_stride) {
- copy_block(src, src_stride, l, dest, dest_stride);
- fliplrud(dest, dest_stride, l);
-}
-
#if CONFIG_WAVELETS
static void forw_tx32x32(MACROBLOCK *x, int plane,
const int16_t *src_diff, int diff_stride,
tran_low_t *const coeff) {
MACROBLOCKD *const xd = &x->e_mbd;
- int16_t src_diff2[256];
TX_TYPE tx_type = get_tx_type(plane, xd);
if (tx_type == DCT_DCT) {
vp9_fdct16x16(src_diff, coeff, diff_stride);
} else if (is_dst_used(tx_type)) {
vp9_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
#endif // CONFIG_DST1
- } else if (tx_type == FLIPADST_DCT) {
- copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
- vp9_fht16x16(src_diff2, coeff, 16, ADST_DCT);
- } else if (tx_type == DCT_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
- vp9_fht16x16(src_diff2, coeff, 16, DCT_ADST);
- } else if (tx_type == FLIPADST_FLIPADST) {
- copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16);
- vp9_fht16x16(src_diff2, coeff, 16, ADST_ADST);
- } else if (tx_type == ADST_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
- vp9_fht16x16(src_diff2, coeff, 16, ADST_ADST);
- } else if (tx_type == FLIPADST_ADST) {
- copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
- vp9_fht16x16(src_diff2, coeff, 16, ADST_ADST);
} else {
vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
}
const int16_t *src_diff, int diff_stride,
tran_low_t *const coeff) {
MACROBLOCKD *const xd = &x->e_mbd;
- int16_t src_diff2[64];
TX_TYPE tx_type = get_tx_type(plane, xd);
if (tx_type == DCT_DCT) {
vp9_fdct8x8(src_diff, coeff, diff_stride);
} else if (is_dst_used(tx_type)) {
vp9_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
#endif // CONFIG_DST1
- } else if (tx_type == FLIPADST_DCT) {
- copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
- vp9_fht8x8(src_diff2, coeff, 8, ADST_DCT);
- } else if (tx_type == DCT_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
- vp9_fht8x8(src_diff2, coeff, 8, DCT_ADST);
- } else if (tx_type == FLIPADST_FLIPADST) {
- copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8);
- vp9_fht8x8(src_diff2, coeff, 8, ADST_ADST);
- } else if (tx_type == ADST_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
- vp9_fht8x8(src_diff2, coeff, 8, ADST_ADST);
- } else if (tx_type == FLIPADST_ADST) {
- copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
- vp9_fht8x8(src_diff2, coeff, 8, ADST_ADST);
} else {
vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
}
const int16_t *src_diff, int diff_stride,
tran_low_t *const coeff) {
MACROBLOCKD *const xd = &x->e_mbd;
- int16_t src_diff2[16];
TX_TYPE tx_type = get_tx_type_4x4(plane, xd, block);
if (tx_type == DCT_DCT) {
x->fwd_txm4x4(src_diff, coeff, diff_stride);
} else if (is_dst_used(tx_type)) {
vp9_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
#endif // CONFIG_DST1
- } else if (tx_type == FLIPADST_DCT) {
- copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
- vp9_fht4x4(src_diff2, coeff, 4, ADST_DCT);
- } else if (tx_type == DCT_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
- vp9_fht4x4(src_diff2, coeff, 4, DCT_ADST);
- } else if (tx_type == FLIPADST_FLIPADST) {
- copy_fliplrud(src_diff, diff_stride, 4, src_diff2, 4);
- vp9_fht4x4(src_diff2, coeff, 4, ADST_ADST);
- } else if (tx_type == ADST_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
- vp9_fht4x4(src_diff2, coeff, 4, ADST_ADST);
- } else if (tx_type == FLIPADST_ADST) {
- copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
- vp9_fht4x4(src_diff2, coeff, 4, ADST_ADST);
} else {
vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
}
const int16_t *src_diff, int diff_stride,
tran_low_t *const coeff) {
MACROBLOCKD *const xd = &x->e_mbd;
- int16_t src_diff2[256];
TX_TYPE tx_type = get_tx_type(plane, xd);
if (tx_type == DCT_DCT) {
vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
} else if (is_dst_used(tx_type)) {
vp9_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
#endif // CONFIG_DST1
- } else if (tx_type == FLIPADST_DCT) {
- copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
- vp9_highbd_fht16x16(src_diff2, coeff, 16, ADST_DCT);
- } else if (tx_type == DCT_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
- vp9_highbd_fht16x16(src_diff2, coeff, 16, DCT_ADST);
- } else if (tx_type == FLIPADST_FLIPADST) {
- copy_fliplrud(src_diff, diff_stride, 16, src_diff2, 16);
- vp9_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
- } else if (tx_type == ADST_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 16, src_diff2, 16);
- vp9_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
- } else if (tx_type == FLIPADST_ADST) {
- copy_flipud(src_diff, diff_stride, 16, src_diff2, 16);
- vp9_highbd_fht16x16(src_diff2, coeff, 16, ADST_ADST);
} else {
vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
}
const int16_t *src_diff, int diff_stride,
tran_low_t *const coeff) {
MACROBLOCKD *const xd = &x->e_mbd;
- int16_t src_diff2[64];
TX_TYPE tx_type = get_tx_type(plane, xd);
if (tx_type == DCT_DCT) {
vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
} else if (is_dst_used(tx_type)) {
vp9_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
#endif // CONFIG_DST1
- } else if (tx_type == FLIPADST_DCT) {
- copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
- vp9_highbd_fht8x8(src_diff2, coeff, 8, ADST_DCT);
- } else if (tx_type == DCT_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
- vp9_highbd_fht8x8(src_diff2, coeff, 8, DCT_ADST);
- } else if (tx_type == FLIPADST_FLIPADST) {
- copy_fliplrud(src_diff, diff_stride, 8, src_diff2, 8);
- vp9_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
- } else if (tx_type == ADST_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 8, src_diff2, 8);
- vp9_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
- } else if (tx_type == FLIPADST_ADST) {
- copy_flipud(src_diff, diff_stride, 8, src_diff2, 8);
- vp9_highbd_fht8x8(src_diff2, coeff, 8, ADST_ADST);
} else {
vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
}
const int16_t *src_diff, int diff_stride,
tran_low_t *const coeff) {
MACROBLOCKD *const xd = &x->e_mbd;
- int16_t src_diff2[16];
TX_TYPE tx_type = get_tx_type_4x4(plane, xd, block);
if (tx_type == DCT_DCT) {
x->fwd_txm4x4(src_diff, coeff, diff_stride);
} else if (is_dst_used(tx_type)) {
vp9_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
#endif // CONFIG_DST1
- } else if (tx_type == FLIPADST_DCT) {
- copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
- vp9_highbd_fht4x4(src_diff2, coeff, 4, ADST_DCT);
- } else if (tx_type == DCT_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
- vp9_highbd_fht4x4(src_diff2, coeff, 4, DCT_ADST);
- } else if (tx_type == FLIPADST_FLIPADST) {
- copy_fliplrud(src_diff, diff_stride, 4, src_diff2, 4);
- vp9_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST);
- } else if (tx_type == ADST_FLIPADST) {
- copy_fliplr(src_diff, diff_stride, 4, src_diff2, 4);
- vp9_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST);
- } else if (tx_type == FLIPADST_ADST) {
- copy_flipud(src_diff, diff_stride, 4, src_diff2, 4);
- vp9_highbd_fht4x4(src_diff2, coeff, 4, ADST_ADST);
} else {
vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
}
*/
#include <emmintrin.h> // SSE2
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vp9/encoder/vp9_dct.h"
#include "vp9/encoder/x86/vp9_dct_sse2.h"
store_output(in0, output);
}
+// Reverse the order of the 8 16-bit words in an __m128i vector.
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+ const __m128i a = _mm_shufflelo_epi16(x, 0x1b);  // 0x1b: reverse low 4 words
+ const __m128i b = _mm_shufflehi_epi16(a, 0x1b);  // 0x1b: reverse high 4 words
+ return _mm_shuffle_epi32(b, 0x4e);  // 0x4e: swap the two 64-bit halves
+}
+
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
- int stride) {
+ int stride, int flipud, int fliplr) {
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
__m128i mask;
- in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ if (!flipud) {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ } else {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
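+ // Each 4-pixel row sits in the low 64 bits, so reversing the low 4 words
+ // mirrors the row.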
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ }
in[0] = _mm_slli_epi16(in[0], 4);
in[1] = _mm_slli_epi16(in[1], 4);
vp9_fdct4x4_sse2(input, output, stride);
break;
case ADST_DCT:
- load_buffer_4x4(input, in, stride);
+ load_buffer_4x4(input, in, stride, 0, 0);
fadst4_sse2(in);
fdct4_sse2(in);
write_buffer_4x4(output, in);
break;
case DCT_ADST:
- load_buffer_4x4(input, in, stride);
+ load_buffer_4x4(input, in, stride, 0, 0);
fdct4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
case ADST_ADST:
- load_buffer_4x4(input, in, stride);
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_4x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fdct4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(input, in, stride, 0, 1);
+ fdct4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(input, in, stride, 1, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(input, in, stride, 1, 0);
fadst4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
+#endif // CONFIG_EXT_TX
default:
assert(0);
break;
// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
- int stride) {
- in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ int stride, int flipud, int fliplr) {
+ if (!flipud) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+ }
in[0] = _mm_slli_epi16(in[0], 2);
in[1] = _mm_slli_epi16(in[1], 2);
vp9_fdct8x8_sse2(input, output, stride);
break;
case ADST_DCT:
- load_buffer_8x8(input, in, stride);
+ load_buffer_8x8(input, in, stride, 0, 0);
fadst8_sse2(in);
fdct8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case DCT_ADST:
- load_buffer_8x8(input, in, stride);
+ load_buffer_8x8(input, in, stride, 0, 0);
fdct8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case ADST_ADST:
- load_buffer_8x8(input, in, stride);
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_8x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ fdct8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1);
+ fdct8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 1);
fadst8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
+ case ADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+#endif // CONFIG_EXT_TX
default:
assert(0);
break;
}
static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
- __m128i *in1, int stride) {
+ __m128i *in1, int stride,
+ int flipud, int fliplr) {
+ // Split the 16x16 input into four 8x8 sub-blocks.
+ const int16_t *topL = input;
+ const int16_t *topR = input + 8;
+ const int16_t *botL = input + 8 * stride;
+ const int16_t *botR = input + 8 * stride + 8;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ // Swap left columns
+ tmp = topL; topL = botL; botL = tmp;
+ // Swap right columns
+ tmp = topR; topR = botR; botR = tmp;
+ }
+
+ if (fliplr) {
+ // Swap top rows
+ tmp = topL; topL = topR; topR = tmp;
+ // Swap bottom rows
+ tmp = botL; botL = botR; botR = tmp;
+ }
+
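+ // load_buffer_8x8 flips within each 8x8 sub-block; the pointer swaps
+ // above reorder the sub-blocks themselves.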
// load first 8 columns
- load_buffer_8x8(input, in0, stride);
- load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+ load_buffer_8x8(topL, in0, stride, flipud, fliplr);
+ load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
- input += 8;
// load second 8 columns
- load_buffer_8x8(input, in1, stride);
- load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+ load_buffer_8x8(topR, in1, stride, flipud, fliplr);
+ load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
}
static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
vp9_fdct16x16_sse2(input, output, stride);
break;
case ADST_DCT:
- load_buffer_16x16(input, in0, in1, stride);
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fdct16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case DCT_ADST:
- load_buffer_16x16(input, in0, in1, stride);
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
fdct16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case ADST_ADST:
- load_buffer_16x16(input, in0, in1, stride);
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 1);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
+#endif // CONFIG_EXT_TX
default:
assert(0);
break;
if (tx_type == DCT_DCT) {
vp9_highbd_fdct4x4_sse2(input, output, stride);
} else {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = &out[0];
- int i, j;
- tran_low_t temp_in[4], temp_out[4];
- const transform_2d ht = FHT_4[tx_type];
-
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j)
- temp_in[j] = input[j * stride + i] * 16;
- if (i == 0 && temp_in[0])
- temp_in[0] += 1;
- ht.cols(temp_in, temp_out);
- for (j = 0; j < 4; ++j)
- outptr[j * 4 + i] = temp_out[j];
- }
-
- // Rows
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j)
- temp_in[j] = out[j + i * 4];
- ht.rows(temp_in, temp_out);
- for (j = 0; j < 4; ++j)
- output[j + i * 4] = (temp_out[j] + 1) >> 2;
- }
+ vp9_highbd_fht4x4_c(input, output, stride, tx_type);
}
}
if (tx_type == DCT_DCT) {
vp9_highbd_fdct8x8_sse2(input, output, stride);
} else {
- tran_low_t out[64];
- tran_low_t *outptr = &out[0];
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
- const transform_2d ht = FHT_8[tx_type];
-
- // Columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = input[j * stride + i] * 4;
- ht.cols(temp_in, temp_out);
- for (j = 0; j < 8; ++j)
- outptr[j * 8 + i] = temp_out[j];
- }
-
- // Rows
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j)
- temp_in[j] = out[j + i * 8];
- ht.rows(temp_in, temp_out);
- for (j = 0; j < 8; ++j)
- output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
- }
+ vp9_highbd_fht8x8_c(input, output, stride, tx_type);
}
}
-void vp9_highbd_fht16x16_sse2(int16_t *input, tran_low_t *output,
+void vp9_highbd_fht16x16_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_highbd_fdct16x16_sse2(input, output, stride);
} else {
- tran_low_t out[256];
- tran_low_t *outptr = &out[0];
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
- const transform_2d ht = FHT_16[tx_type];
-
- // Columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = input[j * stride + i] * 4;
- ht.cols(temp_in, temp_out);
- for (j = 0; j < 16; ++j)
- outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
- }
-
- // Rows
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j + i * 16];
- ht.rows(temp_in, temp_out);
- for (j = 0; j < 16; ++j)
- output[j + i * 16] = temp_out[j];
- }
+ vp9_highbd_fht16x16_c(input, output, stride, tx_type);
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH