From 74854987735599e2c42359d4ee0a59d4b78ebb47 Mon Sep 17 00:00:00 2001
From: Debargha Mukherjee
Date: Fri, 12 Feb 2016 16:44:33 -0800
Subject: [PATCH] Extends ext-tx to support 32x32 masked transforms

Adds new 32x32 masked 1-d transforms that combine 1-D length-16
DCT with length-16 identity transforms. To be continued in
subsequent patches.

Change-Id: I0b4f66492d44c079b3c3b531ba48a97201de1484
---
 vp10/common/idct.c            | 236 ++++++++++++++++++++++++++++++++--
 vp10/common/vp10_rtcd_defs.pl |   9 ++
 vp10/encoder/dct.c            | 100 +++++++++++++-
 vpx_dsp/inv_txfm.c            |  12 +-
 vpx_dsp/inv_txfm.h            |   1 +
 vpx_dsp/txfm_common.h         |   5 +-
 6 files changed, 343 insertions(+), 20 deletions(-)

diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index 6f38f74a7..dbb50fbba 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -259,6 +259,73 @@ void idst16_c(const tran_low_t *input, tran_low_t *output) {
   output[15] = WRAPLOW(-step2[0] + step2[15], 8);
 }
 
+#if CONFIG_EXT_TX
+// For use in lieu of DST
+static void ihalfcenter32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 8; ++i) {
+    output[i] = input[16 + i] * 4;
+    output[24 + i] = input[24 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+  }
+  idct16_c(inputhalf, output + 8);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+  }
+  idct16_c(inputhalf, output + 16);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_ihalfcenter32_c(const tran_low_t *input, tran_low_t *output,
+                                   int bd) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 8; ++i) {
+    output[i] = input[16 + i] * 4;
+    output[24 + i] = input[24 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * Sqrt2, bd);
+  }
+  vpx_highbd_idct16_c(inputhalf, output + 8, bd);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+static void highbd_ihalfright32_c(const tran_low_t *input, tran_low_t *output,
+                                  int bd) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * Sqrt2, bd);
+  }
+  vpx_highbd_idct16_c(inputhalf, output + 16, bd);
+  // Note overall scaling factor is 4 times orthogonal
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_TX
+
 // Inverse identiy transform and add.
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int bs) {
@@ -808,6 +875,67 @@ void vp10_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
   }
 }
 
+#if CONFIG_EXT_TX
+void vp10_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+                              int stride, int tx_type) {
+  static const transform_2d IHT_32[] = {
+    { idct32_c, idct32_c },                // DCT_DCT           = 0,
+    { ihalfright32_c, idct32_c },          // ADST_DCT          = 1,
+    { idct32_c, ihalfright32_c },          // DCT_ADST          = 2,
+    { ihalfright32_c, ihalfright32_c },    // ADST_ADST         = 3,
+    { ihalfright32_c, idct32_c },          // FLIPADST_DCT      = 4,
+    { idct32_c, ihalfright32_c },          // DCT_FLIPADST      = 5,
+    { ihalfright32_c, ihalfright32_c },    // FLIPADST_FLIPADST = 6,
+    { ihalfright32_c, ihalfright32_c },    // ADST_FLIPADST     = 7,
+    { ihalfright32_c, ihalfright32_c },    // FLIPADST_ADST     = 8,
+    { ihalfcenter32_c, idct32_c },         // DST_DCT           = 9,
+    { idct32_c, ihalfcenter32_c },         // DCT_DST           = 10,
+    { ihalfcenter32_c, ihalfright32_c },   // DST_ADST          = 11,
+    { ihalfright32_c, ihalfcenter32_c },   // ADST_DST          = 12,
+    { ihalfcenter32_c, ihalfright32_c },   // DST_FLIPADST      = 13,
+    { ihalfright32_c, ihalfcenter32_c },   // FLIPADST_DST      = 14,
+    { ihalfcenter32_c, ihalfcenter32_c },  // DST_DST           = 15
+  };
+
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[32][32];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 32;
+
+  // inverse transform row vectors
+  for (i = 0; i < 32; ++i) {
+    IHT_32[tx_type].rows(input, out[i]);
+    input += 32;
+  }
+
+  // transpose
+  for (i = 1 ; i < 32; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 32; ++i) {
+    IHT_32[tx_type].cols(out[i], out[i]);
+  }
+
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32);
+
+  // Sum with the destination
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+    }
+  }
+}
+#endif  // CONFIG_EXT_TX
+
 // idct
 void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                       int eob) {
@@ -998,15 +1126,27 @@ void vp10_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
       vp10_idct32x32_add(input, dest, stride, eob);
       break;
 #if CONFIG_EXT_TX
-    case IDTX:
-      inv_idtx_add_c(input, dest, stride, 32);
-      break;
-#endif  // CONFIG_EXT_TX
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-      assert(0);
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
       break;
+    case IDTX:
+      inv_idtx_add_c(input, dest, stride, 32);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -1212,6 +1352,70 @@ void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
+#if CONFIG_EXT_TX
+void vp10_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int tx_type, int bd) {
+  static const highbd_transform_2d HIGH_IHT_32[] = {
+    { vpx_highbd_idct32_c, vpx_highbd_idct32_c },        // DCT_DCT
+    { highbd_ihalfright32_c, vpx_highbd_idct32_c },      // ADST_DCT
+    { vpx_highbd_idct32_c, highbd_ihalfright32_c },      // DCT_ADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },    // ADST_ADST
+    { highbd_ihalfright32_c, vpx_highbd_idct32_c },      // FLIPADST_DCT
+    { vpx_highbd_idct32_c, highbd_ihalfright32_c },      // DCT_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },    // FLIPADST_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },    // ADST_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },    // FLIPADST_ADST
+    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c },     // DST_DCT
+    { vpx_highbd_idct32_c, highbd_ihalfcenter32_c },     // DCT_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },   // DST_ADST
+    { highbd_ihalfright32_c, highbd_ihalfcenter32_c },   // ADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },   // DST_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfcenter32_c },   // FLIPADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c },  // DST_DST
+  };
+
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[32][32];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 32;
+
+  // inverse transform row vectors
+  for (i = 0; i < 32; ++i) {
+    HIGH_IHT_32[tx_type].rows(input, out[i], bd);
+    input += 32;
+  }
+
+  // transpose
+  for (i = 1 ; i < 32; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 32; ++i) {
+    HIGH_IHT_32[tx_type].cols(out[i], out[i], bd);
+  }
+
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32);
+
+  // Sum with the destination
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 6), bd);
+    }
+  }
+}
+#endif  // CONFIG_EXT_TX
+
 // idct
 void vp10_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                              int eob, int bd) {
@@ -1409,15 +1613,27 @@ void vp10_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
       vp10_highbd_idct32x32_add(input, dest, stride, eob, bd);
       break;
 #if CONFIG_EXT_TX
-    case IDTX:
-      highbd_inv_idtx_add_c(input, dest, stride, 32, bd);
-      break;
-#endif  // CONFIG_EXT_TX
     case ADST_DCT:
     case DCT_ADST:
    case ADST_ADST:
-      assert(0);
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
       break;
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 32, bd);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
      assert(0);
      break;
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 9860baedf..c9f02953f 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -404,6 +404,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht16x16 sse2/;
 
+  add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht32x32/;
+
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
 } else {
@@ -416,6 +419,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht16x16 sse2 msa/;
 
+  add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht32x32/;
+
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
 }
@@ -642,6 +648,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp10_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_highbd_fht16x16/;
 
+  add_proto qw/void vp10_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_highbd_fht32x32/;
+
   add_proto qw/void vp10_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_highbd_fwht4x4/;
 
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index cdb732a44..333adbbcb 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -14,7 +14,6 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
-
 #include "vp10/common/blockd.h"
 #include "vp10/common/idct.h"
 #include "vpx_dsp/fwd_txfm.h"
@@ -538,7 +537,7 @@ static void fdct16(const tran_low_t *input, tran_low_t *output) {
   range_check(output, 16, 16);
 }
 
-/* TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
+#if CONFIG_EXT_TX
 static void fdct32(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[32];
@@ -936,7 +935,7 @@ static void fdct32(const tran_low_t *input, tran_low_t *output) {
   range_check(output, 32, 18);
 }
-*/
+#endif  // CONFIG_EXT_TX
 
 static void fadst4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t x0, x1, x2, x3;
@@ -1213,6 +1212,37 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
 }
 
 #if CONFIG_EXT_TX
+// For use in lieu of DST
+static void fhalfcenter32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 8; ++i) {
+    output[16 + i] = input[i] * 4;
+    output[24 + i] = input[24 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 8] * Sqrt2);
+  }
+  fdct16(inputhalf, output);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+// For use in lieu of ADST
+static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[16 + i] = input[i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
+  }
+  fdct16(inputhalf, output);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
 static void copy_block(const int16_t *src, int src_stride, int l,
                        int16_t *dest, int dest_stride) {
   int i;
@@ -1375,6 +1405,27 @@ static const transform_2d FHT_16[] = {
 #endif  // CONFIG_EXT_TX
 };
 
+#if CONFIG_EXT_TX
+static const transform_2d FHT_32[] = {
+  { fdct32, fdct32 },                // DCT_DCT           = 0,
+  { fhalfright32, fdct32 },          // ADST_DCT          = 1,
+  { fdct32, fhalfright32 },          // DCT_ADST          = 2,
+  { fhalfright32, fhalfright32 },    // ADST_ADST         = 3,
+  { fhalfright32, fdct32 },          // FLIPADST_DCT      = 4,
+  { fdct32, fhalfright32 },          // DCT_FLIPADST      = 5,
+  { fhalfright32, fhalfright32 },    // FLIPADST_FLIPADST = 6,
+  { fhalfright32, fhalfright32 },    // ADST_FLIPADST     = 7,
+  { fhalfright32, fhalfright32 },    // FLIPADST_ADST     = 8,
+  { fhalfcenter32, fdct32 },         // DST_DCT           = 9,
+  { fdct32, fhalfcenter32 },         // DCT_DST           = 10,
+  { fhalfcenter32, fhalfright32 },   // DST_ADST          = 11,
+  { fhalfright32, fhalfcenter32 },   // ADST_DST          = 12,
+  { fhalfcenter32, fhalfright32 },   // DST_FLIPADST      = 13,
+  { fhalfright32, fhalfcenter32 },   // FLIPADST_DST      = 14,
+  { fhalfcenter32, fhalfcenter32 },  // DST_DST           = 15
+};
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
                    int tx_type) {
   if (tx_type == DCT_DCT) {
@@ -1671,3 +1722,46 @@ void vp10_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
   vp10_fht16x16_c(input, output, stride, tx_type);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EXT_TX
+void vp10_fht32x32_c(const int16_t *input, tran_low_t *output,
+                     int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vpx_fdct32x32_c(input, output, stride);
+  } else {
+    tran_low_t out[1024];
+    int i, j;
+    tran_low_t temp_in[32], temp_out[32];
+    const transform_2d ht = FHT_32[tx_type];
+
+    int16_t flipped_input[32 * 32];
+    maybe_flip_input(&input, &stride, 32, flipped_input, tx_type);
+
+    // Columns
+    for (i = 0; i < 32; ++i) {
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 32; ++j)
+        out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+    }
+
+    // Rows
+    for (i = 0; i < 32; ++i) {
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = out[j + i * 32];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 32; ++j)
+        output[j + i * 32] =
+            (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_fht32x32_c(const int16_t *input, tran_low_t *output,
+                            int stride, int tx_type) {
+  vp10_fht32x32_c(input, output, stride, tx_type);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_TX
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index a0f59bf75..402fd9a23 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -2057,8 +2057,8 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
-static void highbd_idct32_c(const tran_low_t *input,
-                            tran_low_t *output, int bd) {
+void vpx_highbd_idct32_c(const tran_low_t *input,
+                         tran_low_t *output, int bd) {
   tran_low_t step1[32], step2[32];
   tran_high_t temp1, temp2;
   (void) bd;
@@ -2447,7 +2447,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
 
     if (zero_coeff[0] | zero_coeff[1])
-      highbd_idct32_c(input, outptr, bd);
+      vpx_highbd_idct32_c(input, outptr, bd);
     else
       memset(outptr, 0, sizeof(tran_low_t) * 32);
     input += 32;
@@ -2458,7 +2458,7 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    highbd_idct32_c(temp_in, temp_out, bd);
+    vpx_highbd_idct32_c(temp_in, temp_out, bd);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2477,7 +2477,7 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
 
   // Rows
   // Only upper-left 8x8 has non-zero coeff.
   for (i = 0; i < 8; ++i) {
-    highbd_idct32_c(input, outptr, bd);
+    vpx_highbd_idct32_c(input, outptr, bd);
     input += 32;
     outptr += 32;
   }
@@ -2485,7 +2485,7 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    highbd_idct32_c(temp_in, temp_out, bd);
+    vpx_highbd_idct32_c(temp_in, temp_out, bd);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index 23588139e..adbb83872 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -100,6 +100,7 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output);
 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
 
 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
diff --git a/vpx_dsp/txfm_common.h b/vpx_dsp/txfm_common.h
index 442e6a57b..9b0e9900a 100644
--- a/vpx_dsp/txfm_common.h
+++ b/vpx_dsp/txfm_common.h
@@ -57,10 +57,13 @@ static const tran_high_t cospi_29_64 = 2404;
 static const tran_high_t cospi_30_64 = 1606;
 static const tran_high_t cospi_31_64 = 804;
 
-// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 
+// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
 static const tran_high_t sinpi_1_9 = 5283;
 static const tran_high_t sinpi_2_9 = 9929;
 static const tran_high_t sinpi_3_9 = 13377;
 static const tran_high_t sinpi_4_9 = 15212;
 
+// 16384 * sqrt(2)
+static const tran_high_t Sqrt2 = 23170;
+
 #endif  // VPX_DSP_TXFM_COMMON_H_
-- 
2.40.0
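
Background sketch (illustrative only, not part of the patch): the building block the patch
adds is the "half-right" / "half-center" 32-point transform, where 16 of the 32 inputs pass
through a scaled identity (multiply by 4) and the other 16 go through a length-16 DCT
pre-scaled by Sqrt2, so that, per the in-code comments, both halves come out at roughly
four times an orthonormal transform. The standalone C program below mirrors the structure
of fhalfright32() in floating point. naive_dct16() and halfright32_sketch() are hypothetical
names, not libvpx APIs; the explicit 4x scaling on the DCT path here stands in for the
Sqrt2 pre-scale combined with the internal gain of libvpx's fixed-point fdct16().

/* Build with, e.g.: cc -o halfright_sketch halfright_sketch.c -lm */
#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Orthonormal DCT-II of length 16 (reference implementation, O(N^2)). */
static void naive_dct16(const double *in, double *out) {
  int k, n;
  for (k = 0; k < 16; ++k) {
    double sum = 0.0;
    for (n = 0; n < 16; ++n)
      sum += in[n] * cos(M_PI * (2 * n + 1) * k / 32.0);
    out[k] = sum * (k == 0 ? sqrt(1.0 / 16.0) : sqrt(2.0 / 16.0));
  }
}

/* Half-right 32-point forward transform, floating-point sketch:
 * outputs 16..31 are the first 16 inputs passed through an identity,
 * outputs 0..15 are a DCT-16 of the last 16 inputs.  Both halves are
 * scaled by 4 so the overall gain matches on each path. */
static void halfright32_sketch(const double *input, double *output) {
  double dct_out[16];
  int i;
  for (i = 0; i < 16; ++i)
    output[16 + i] = input[i] * 4.0;        /* identity half */
  naive_dct16(input + 16, dct_out);         /* DCT half */
  for (i = 0; i < 16; ++i)
    output[i] = dct_out[i] * 4.0;
}

int main(void) {
  double in[32], out[32];
  int i;
  for (i = 0; i < 32; ++i)
    in[i] = (double)(i % 7);                /* arbitrary test signal */
  halfright32_sketch(in, out);
  for (i = 0; i < 32; ++i)
    printf("%2d: %9.3f\n", i, out[i]);
  return 0;
}

Reusing the existing 16-point DCT plus a scaled copy of the input keeps the new 32x32
row/column transforms cheap relative to a dedicated 32-point ADST or DST, which appears
to be the motivation for pairing these with the full idct32/fdct32 in the FHT_32 and
IHT_32 tables above.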