From 85ab9d56cc8c8e00cb2d5bb6fc8283ab40a30fe0 Mon Sep 17 00:00:00 2001
From: Geza Lore
Date: Fri, 13 Nov 2015 15:16:28 +0000
Subject: [PATCH] Flip the result of the inv transform for FLIPADST.

This is a port of 4f5108090a6047d5d4d9ce1df302da23b2ef4bc5

This commit also fixes a bug where the FLIPADST transforms, when
combined with a DST (that is, FLIPADST_DST and DST_FLIPADST), did not
actually perform a flipped transform but a straight ADST instead. This
was because the C implementation they fell back on did not implement
flipping. This is now fixed as well, and FLIPADST_DST and DST_FLIPADST
do what they are supposed to do.

There are 3 functions in the SR_MODE experiment that should also be
updated, but since the SR_MODE build is broken at the upstream tip of
nextgen, I could not test them, so I have put in assertions and FIXME
notes at the problematic places.

Change-Id: I5b8175b85f944f2369b183a26256e08d97f4bdef
---
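Reviewer note (not part of the commit): the flipping mechanics may be
easier to see in isolation. The sketch below is hypothetical demo code;
it is not in the tree and the names are made up. It shows the
FLIPUD_PTR trick this patch relies on: rather than flipping pixels
after the fact, point the destination pointer at its last row and
negate the stride, so the unchanged add-loop writes its rows in
reverse order.

  /* flipud_demo.c : standalone sketch, not part of this patch */
  #include <stdio.h>

  #define FLIPUD_PTR(dest, stride, size) do { \
    (dest) = (dest) + ((size) - 1) * (stride); \
    (stride) = - (stride); \
  } while (0)

  int main(void) {
    unsigned char buf[4][4] = { { 0 } };
    unsigned char *dst = &buf[0][0];
    int stride = 4, r, c;

    FLIPUD_PTR(dst, stride, 4);  /* dst -> last row, stride = -4 */

    /* The usual "row r at dst[r * stride + c]" addressing now
     * stores row r into buf[3 - r], i.e. upside down. */
    for (r = 0; r < 4; ++r)
      for (c = 0; c < 4; ++c)
        dst[r * stride + c] = (unsigned char)(10 * r + c);

    for (r = 0; r < 4; ++r)  /* prints row 30..33 first, 0..3 last */
      printf("%d %d %d %d\n", buf[r][0], buf[r][1], buf[r][2], buf[r][3]);
    return 0;
  }

The LR cases fall out of the same trick: since it is the transpose of
the source that gets added to the destination, UD-flipping the
transposed source (via *src/*sstride) is an LR flip in destination
coordinates. maybe_flip_strides() below composes these two moves per
tx_type, and the SSE2 path does the equivalent with the fliplr_*()
helpers and FLIPLR_16x16 on the registers plus FLIPUD_PTR on the
destination.
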
 vp9/common/vp9_idct.c                 | 596 ++++++++++++--------
 vp9/common/x86/vp9_idct_intrin_sse2.c | 171 +++++++-
 2 files changed, 423 insertions(+), 344 deletions(-)

diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 81106446d..a9431f4da 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -16,6 +16,59 @@
 #include "vp9/common/vp9_idct.h"
 
 #if CONFIG_EXT_TX
+#define FLIPUD_PTR(dest, stride, size) do { \
+  (dest) = (dest) + ((size) - 1) * (stride); \
+  (stride) = - (stride); \
+} while (0)
+
+static void maybe_flip_strides(uint8_t **dst, int *dstride,
+                               tran_low_t **src, int *sstride,
+                               int tx_type, int size) {
+  // Note that the transpose of src will be added to dst. In order to LR
+  // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+  // the addends, we UD flip the dst.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      break;
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case FLIPADST_FLIPADST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+      break;
+    case DST_FLIPADST:
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case FLIPADST_DST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
 void idst4(const tran_low_t *input, tran_low_t *output) {
   // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
   static const int32_t sinvalue_lookup[] = {
@@ -635,25 +688,41 @@ void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
   };
   int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
+  tran_low_t tmp;
+  tran_low_t out[4][4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
 
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    IHT_4[tx_type].rows(input, outptr);
+    IHT_4[tx_type].rows(input, out[i]);
     input += 4;
-    outptr += 4;
+  }
+
+  // transpose
+  for (i = 1 ; i < 4; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
   }
 
   // inverse transform column vectors
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    IHT_4[tx_type].cols(temp_in, temp_out);
+    IHT_4[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
     }
   }
 }
@@ -756,97 +825,44 @@ static const transform_2d IHT_8[] = {
 #endif  // CONFIG_EXT_TX
 };
 
-#if CONFIG_EXT_TX
-void fliplr(uint8_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l; ++i) {
-    for (j = 0; j < l / 2; ++j) {
-      const uint8_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + l - 1 - j];
-      dest[i * stride + l - 1 - j] = tmp;
-    }
-  }
-}
-
-void flipud(uint8_t *dest, int stride, int l) {
-  int i, j;
-  for (j = 0; j < l; ++j) {
-    for (i = 0; i < l / 2; ++i) {
-      const uint8_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
-      dest[(l - 1 - i) * stride + j] = tmp;
-    }
-  }
-}
-
-void fliplrud(uint8_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l / 2; ++i) {
-    for (j = 0; j < l; ++j) {
-      const uint8_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
-      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
-    }
-  }
-}
-
-void fliplr16(uint16_t *dest, int stride, int l) {
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
   int i, j;
-  for (i = 0; i < l; ++i) {
-    for (j = 0; j < l / 2; ++j) {
-      const uint16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[i * stride + l - 1 - j];
-      dest[i * stride + l - 1 - j] = tmp;
-    }
-  }
-}
+  tran_low_t tmp;
+  tran_low_t out[8][8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
 
-void flipud16(uint16_t *dest, int stride, int l) {
-  int i, j;
-  for (j = 0; j < l; ++j) {
-    for (i = 0; i < l / 2; ++i) {
-      const uint16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
-      dest[(l - 1 - i) * stride + j] = tmp;
-    }
+  // inverse transform row vectors
+  for (i = 0; i < 8; ++i) {
+    IHT_8[tx_type].rows(input, out[i]);
+    input += 8;
   }
-}
 
-void fliplrud16(uint16_t *dest, int stride, int l) {
-  int i, j;
-  for (i = 0; i < l / 2; ++i) {
-    for (j = 0; j < l; ++j) {
-      const uint16_t tmp = dest[i * stride + j];
-      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
-      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
+  // transpose
+  for (i = 1 ; i < 8; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
     }
   }
-}
-#endif  // CONFIG_EXT_TX
-
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                         int tx_type) {
-  int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  const transform_2d ht = IHT_8[tx_type];
 
-  // inverse transform row vectors
+  // inverse transform column vectors
   for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr);
-    input += 8;
-    outptr += 8;
+    IHT_8[tx_type].cols(out[i], out[i]);
  }
 
-  // inverse transform column vectors
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out);
     for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
     }
   }
 }
@@ -1291,26 +1307,41 @@ static const transform_2d IHT_16[] = {
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-  const transform_2d ht = IHT_16[tx_type];
+  tran_low_t tmp;
+  tran_low_t out[16][16];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 16;
 
-  // Rows
+  // inverse transform row vectors
   for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr);
-    input += 16;
-    outptr += 16;
+    IHT_16[tx_type].rows(input, out[i]);
+    input += 16;
   }
 
-  // Columns
+  // transpose
+  for (i = 1 ; i < 16; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 16; ++i) {
+    IHT_16[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
     }
   }
 }
@@ -1911,26 +1942,6 @@ void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 4);
-    vp9_iht4x4_16_add(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 4);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 4);
-    vp9_iht4x4_16_add(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 4);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 4);
-    vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 4);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 4);
-    vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 4);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 4);
-    vp9_iht4x4_16_add(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 4);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht4x4_16_add(input, dest, stride, tx_type);
@@ -1944,26 +1955,6 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 8);
-    vp9_iht8x8_64_add(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 8);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 8);
-    vp9_iht8x8_64_add(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 8);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 8);
-    vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 8);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 8);
-    vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 8);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 8);
-    vp9_iht8x8_64_add(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 8);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht8x8_64_add(input, dest, stride, tx_type);
@@ -1977,26 +1968,6 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht16x16_256_add_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 16);
-    vp9_iht16x16_256_add(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 16);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 16);
-    vp9_iht16x16_256_add(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 16);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 16);
-    vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 16);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 16);
-    vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 16);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 16);
-    vp9_iht16x16_256_add(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 16);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht16x16_256_add(input, dest, stride, tx_type);
@@ -2775,7 +2746,7 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
 
 void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
-  const highbd_transform_2d IHT_4[] = {
+  const highbd_transform_2d HIGH_IHT_4[] = {
     { vp9_highbd_idct4, vp9_highbd_idct4 },    // DCT_DCT = 0
     { highbd_iadst4, vp9_highbd_idct4 },       // ADST_DCT = 1
     { vp9_highbd_idct4, highbd_iadst4 },       // DCT_ADST = 2
@@ -2798,25 +2769,43 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
+  tran_low_t tmp;
+  tran_low_t out[4][4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
 
-  // Inverse transform row vectors.
+  // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    IHT_4[tx_type].rows(input, outptr, bd);
+    HIGH_IHT_4[tx_type].rows(input, out[i], bd);
     input += 4;
-    outptr += 4;
   }
 
-  // Inverse transform column vectors.
+  // transpose
+  for (i = 1 ; i < 4; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 4; ++i) {
+    HIGH_IHT_4[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides((uint8_t**)&dest,
+                     &stride, &outp, &outstride, tx_type, 4 * 2);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    IHT_4[tx_type].cols(temp_in, temp_out, bd);
     for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 4), bd);
     }
   }
 }
@@ -2921,28 +2910,46 @@ static const highbd_transform_2d HIGH_IHT_8[] = {
 
 void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
-  int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // Inverse transform row vectors.
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[8][8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
+
+  // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr, bd);
-    input += 8;
-    outptr += 8;
+    HIGH_IHT_8[tx_type].rows(input, out[i], bd);
+    input += 8;
   }
 
-  // Inverse transform column vectors.
+  // transpose
+  for (i = 1 ; i < 8; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 8; ++i) {
+    HIGH_IHT_8[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides((uint8_t**)&dest,
+                     &stride, &outp, &outstride, tx_type, 8 * 2);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out, bd);
     for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 5), bd);
     }
   }
 }
@@ -3361,28 +3368,46 @@ static const highbd_transform_2d HIGH_IHT_16[] = {
 
 void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int tx_type, int bd) {
-  int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-  const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // Rows
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[16][16];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 16;
+
+  // inverse transform row vectors
   for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr, bd);
-    input += 16;
-    outptr += 16;
+    HIGH_IHT_16[tx_type].rows(input, out[i], bd);
+    input += 16;
   }
 
-  // Columns
+  // transpose
+  for (i = 1 ; i < 16; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 16; ++i) {
+    HIGH_IHT_16[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides((uint8_t**)&dest, &stride,
+                     &outp, &outstride, tx_type, 16 * 2);
+#endif
+
+  // Sum with the destination
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out, bd);
     for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 6), bd);
     }
   }
 }
@@ -3954,26 +3979,6 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-    vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_DCT, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-    vp9_highbd_iht4x4_16_add(input, dest, stride, DCT_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-    vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-    vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
-    vp9_highbd_iht4x4_16_add(input, dest, stride, ADST_ADST, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 4);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
@@ -3987,26 +3992,6 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-    vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_DCT, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-    vp9_highbd_iht8x8_64_add(input, dest, stride, DCT_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-    vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-    vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
-    vp9_highbd_iht8x8_64_add(input, dest, stride, ADST_ADST, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 8);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
@@ -4020,26 +4005,6 @@ void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-    vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_DCT, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-    vp9_highbd_iht16x16_256_add(input, dest, stride, DCT_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-    vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
-    fliplrud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-    vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
-    fliplr16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
-    vp9_highbd_iht16x16_256_add(input, dest, stride, ADST_ADST, bd);
-    flipud16(CONVERT_TO_SHORTPTR(dest), stride, 16);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
@@ -4276,6 +4241,19 @@ void vp9_iht4x4_16_c(const tran_low_t *input, int16_t *dest, int stride,
   tran_low_t *outptr = out;
   tran_low_t temp_in[4], temp_out[4];
 
+  // FIXME: If the SR_MODE experiment is resurrected, then this function must
+  // be fixed to handle the FLIPADST cases by actually flipping its output
+  // See the other vp9_iht*add_c functions
+#if CONFIG_EXT_TX
+  assert(tx_type != FLIPADST_DCT);
+  assert(tx_type != DCT_FLIPADST);
+  assert(tx_type != FLIPADST_FLIPADST);
+  assert(tx_type != ADST_FLIPADST);
+  assert(tx_type != FLIPADST_ADST);
+  assert(tx_type != DST_FLIPADST);
+  assert(tx_type != FLIPADST_DST);
+#endif  // CONFIG_EXT_TX
+
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
     IHT_4[tx_type].rows(input, outptr);
@@ -4302,6 +4280,19 @@ void vp9_iht8x8_64_c(const tran_low_t *input, int16_t *dest, int stride,
   tran_low_t temp_in[8], temp_out[8];
   const transform_2d ht = IHT_8[tx_type];
 
+  // FIXME: If the SR_MODE experiment is resurrected, then this function must
+  // be fixed to handle the FLIPADST cases by actually flipping its output
+  // See the other vp9_iht*add_c functions
+#if CONFIG_EXT_TX
+  assert(tx_type != FLIPADST_DCT);
+  assert(tx_type != DCT_FLIPADST);
+  assert(tx_type != FLIPADST_FLIPADST);
+  assert(tx_type != ADST_FLIPADST);
+  assert(tx_type != FLIPADST_ADST);
+  assert(tx_type != DST_FLIPADST);
+  assert(tx_type != FLIPADST_DST);
+#endif  // CONFIG_EXT_TX
+
   // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
     ht.rows(input, outptr);
@@ -4378,6 +4369,19 @@ void vp9_iht16x16_256_c(const tran_low_t *input, int16_t *dest, int stride,
   tran_low_t temp_in[16], temp_out[16];
   const transform_2d ht = IHT_16[tx_type];
 
+  // FIXME: If the SR_MODE experiment is resurrected, then this function must
+  // be fixed to handle the FLIPADST cases by actually flipping its output
+  // See the other vp9_iht*add_c functions
+#if CONFIG_EXT_TX
+  assert(tx_type != FLIPADST_DCT);
+  assert(tx_type != DCT_FLIPADST);
+  assert(tx_type != FLIPADST_FLIPADST);
+  assert(tx_type != ADST_FLIPADST);
+  assert(tx_type != FLIPADST_ADST);
+  assert(tx_type != DST_FLIPADST);
+  assert(tx_type != FLIPADST_DST);
+#endif  // CONFIG_EXT_TX
+
   // Rows
   for (i = 0; i < 16; ++i) {
     ht.rows(input, outptr);
@@ -4582,26 +4586,6 @@ void vp9_iht4x4(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht4x4_16_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 4);
-    vp9_iht4x4_16(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 4);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 4);
-    vp9_iht4x4_16(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 4);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 4);
-    vp9_iht4x4_16(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 4);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 4);
-    vp9_iht4x4_16(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 4);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 4);
-    vp9_iht4x4_16(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 4);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht4x4_16(input, dest, stride, tx_type);
@@ -4615,26 +4599,6 @@ void vp9_iht8x8(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht8x8_64_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 8);
-    vp9_iht8x8_64(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 8);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 8);
-    vp9_iht8x8_64(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 8);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 8);
-    vp9_iht8x8_64(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 8);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 8);
-    vp9_iht8x8_64(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 8);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 8);
-    vp9_iht8x8_64(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 8);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht8x8_64(input, dest, stride, tx_type);
@@ -4648,26 +4612,6 @@ void vp9_iht16x16(TX_TYPE tx_type, const tran_low_t *input, int16_t *dest,
 #if CONFIG_EXT_TX
   } else if (is_dst_used(tx_type)) {
     vp9_iht16x16_256_c(input, dest, stride, tx_type);
-  } else if (tx_type == FLIPADST_DCT) {
-    flipud(dest, stride, 16);
-    vp9_iht16x16_256(input, dest, stride, ADST_DCT);
-    flipud(dest, stride, 16);
-  } else if (tx_type == DCT_FLIPADST) {
-    fliplr(dest, stride, 16);
-    vp9_iht16x16_256(input, dest, stride, DCT_ADST);
-    fliplr(dest, stride, 16);
-  } else if (tx_type == FLIPADST_FLIPADST) {
-    fliplrud(dest, stride, 16);
-    vp9_iht16x16_256(input, dest, stride, ADST_ADST);
-    fliplrud(dest, stride, 16);
-  } else if (tx_type == ADST_FLIPADST) {
-    fliplr(dest, stride, 16);
-    vp9_iht16x16_256(input, dest, stride, ADST_ADST);
-    fliplr(dest, stride, 16);
-  } else if (tx_type == FLIPADST_ADST) {
-    flipud(dest, stride, 16);
-    vp9_iht16x16_256(input, dest, stride, ADST_ADST);
-    flipud(dest, stride, 16);
 #endif  // CONFIG_EXT_TX
   } else {
     vp9_iht16x16_256(input, dest, stride, tx_type);
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 45fd95b81..5270f5903 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -11,6 +11,55 @@
 #include "vp9/common/x86/vp9_idct_intrin_sse2.h"
 #include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_enums.h"
+
+#if CONFIG_EXT_TX
+// Reverse the 8 16 bit words in __m128i
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+  return _mm_shuffle_epi32(b, 0x4e);
+}
+
+static INLINE void fliplr_4x4(__m128i in[2]) {
+  in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+  in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
+  in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+  in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
+}
+
+static INLINE void fliplr_8x8(__m128i in[8]) {
+  in[0] = mm_reverse_epi16(in[0]);
+  in[1] = mm_reverse_epi16(in[1]);
+  in[2] = mm_reverse_epi16(in[2]);
+  in[3] = mm_reverse_epi16(in[3]);
+
+  in[4] = mm_reverse_epi16(in[4]);
+  in[5] = mm_reverse_epi16(in[5]);
+  in[6] = mm_reverse_epi16(in[6]);
+  in[7] = mm_reverse_epi16(in[7]);
+}
+
+static INLINE void fliplr_16x8(__m128i in[16]) {
+  fliplr_8x8(&in[0]);
+  fliplr_8x8(&in[8]);
+}
+
+#define FLIPLR_16x16(in0, in1) do { \
+  __m128i *tmp; \
+  fliplr_16x8(in0); \
+  fliplr_16x8(in1); \
+  tmp = (in0); \
+  (in0) = (in1); \
+  (in1) = tmp; \
+} while (0)
+
+#define FLIPUD_PTR(dest, stride, size) do { \
+  (dest) = (dest) + ((size) - 1) * (stride); \
+  (stride) = - (stride); \
+} while (0)
+#endif
 
 #define RECON_AND_STORE4X4(dest, in_x) \
 { \
   __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
@@ -126,12 +175,12 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
 
   // Reconstruction and Store
   {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+    __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *) (dest + stride)));
-    d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
-        *(const int *) (dest + stride * 3)), d2);
+    __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+    d0 = _mm_unpacklo_epi32(d0, d1);
+    d2 = _mm_unpacklo_epi32(d3, d2);
     d0 = _mm_unpacklo_epi8(d0, zero);
     d2 = _mm_unpacklo_epi8(d2, zero);
     d0 = _mm_add_epi16(d0, input2);
@@ -271,22 +320,50 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
   in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
 
   switch (tx_type) {
-    case 0: // DCT_DCT
+    case DCT_DCT:
       idct4_sse2(in);
       idct4_sse2(in);
       break;
-    case 1: // ADST_DCT
+    case ADST_DCT:
      idct4_sse2(in);
       iadst4_sse2(in);
       break;
-    case 2: // DCT_ADST
+    case DCT_ADST:
       iadst4_sse2(in);
       idct4_sse2(in);
       break;
-    case 3: // ADST_ADST
+    case ADST_ADST:
       iadst4_sse2(in);
       iadst4_sse2(in);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      break;
+    case DCT_FLIPADST:
+      iadst4_sse2(in);
+      idct4_sse2(in);
+      fliplr_4x4(in);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      fliplr_4x4(in);
+      break;
+    case ADST_FLIPADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      fliplr_4x4(in);
+      break;
+    case FLIPADST_ADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -875,22 +952,50 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
   in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
 
   switch (tx_type) {
-    case 0: // DCT_DCT
+    case DCT_DCT:
+      idct8_sse2(in);
       idct8_sse2(in);
+      break;
+    case ADST_DCT:
+      idct8_sse2(in);
+      iadst8_sse2(in);
+      break;
+    case DCT_ADST:
+      iadst8_sse2(in);
       idct8_sse2(in);
       break;
-    case 1: // ADST_DCT
+    case ADST_ADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
       idct8_sse2(in);
       iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
       break;
-    case 2: // DCT_ADST
+    case DCT_FLIPADST:
       iadst8_sse2(in);
       idct8_sse2(in);
+      fliplr_8x8(in);
       break;
-    case 3: // ADST_ADST
+    case FLIPADST_FLIPADST:
       iadst8_sse2(in);
       iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      fliplr_8x8(in);
       break;
+    case ADST_FLIPADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      fliplr_8x8(in);
+      break;
+    case FLIPADST_ADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -2331,29 +2436,59 @@ static void iadst16_sse2(__m128i *in0, __m128i *in1) {
 
 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                                int tx_type) {
-  __m128i in0[16], in1[16];
+  __m128i in[32];
+  __m128i *in0 = &in[0];
+  __m128i *in1 = &in[16];
 
   load_buffer_8x16(input, in0);
   input += 8;
   load_buffer_8x16(input, in1);
 
   switch (tx_type) {
-    case 0: // DCT_DCT
+    case DCT_DCT:
       idct16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 1: // ADST_DCT
+    case ADST_DCT:
       idct16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
       break;
-    case 2: // DCT_ADST
+    case DCT_ADST:
       iadst16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 3: // ADST_ADST
+    case ADST_ADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+    case DCT_FLIPADST:
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case ADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_ADST:
       iadst16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
       break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
-- 
2.49.0