From 6a47cff882d027542d3320a57e14cbe5ba7404ed Mon Sep 17 00:00:00 2001 From: Debargha Mukherjee Date: Wed, 2 Nov 2016 14:57:42 -0700 Subject: [PATCH] Further work on 64x64 fwd/inv transform support For higher level fwd and inv transform functions. Change-Id: I91518250a0be7d94aada7519f6c9e7ed024574fb --- av1/common/av1_rtcd_defs.pl | 17 +++++ av1/common/enums.h | 36 ++++++---- av1/common/idct.c | 129 +++++++++++++++++++++++++++++----- av1/encoder/hybrid_fwd_txfm.c | 98 ++++++++++++++++++++++++++ 4 files changed, 249 insertions(+), 31 deletions(-) diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 597d5b2a0..ee46820fe 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -111,6 +111,9 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; specialize qw/av1_iht16x16_256_add sse2 avx2/; + + add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/av1_iht32x32_1024_add/; } } else { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 @@ -141,6 +144,10 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; specialize qw/av1_iht16x16_256_add/; + + add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/av1_iht32x32_1024_add/; + } else { add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/av1_iht4x4_16_add sse2 neon dspr2/; @@ -169,6 +176,9 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/; + add_proto qw/void av1_iht32x32_1024_add/, 
"const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/av1_iht32x32_1024_add/; + if (aom_config("CONFIG_EXT_TX") ne "yes") { specialize qw/av1_iht4x4_16_add msa/; specialize qw/av1_iht8x8_64_add msa/; @@ -176,6 +186,13 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { } } } +add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; +specialize qw/av1_iht32x32_1024_add/; + +if (aom_config("CONFIG_TX64X64") eq "yes") { + add_proto qw/void av1_iht64x64_4096_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/av1_iht64x64_4096_add/; +} if (aom_config("CONFIG_NEW_QUANT") eq "yes") { add_proto qw/void quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band"; diff --git a/av1/common/enums.h b/av1/common/enums.h index 2ec83ecae..6c0eb3d6b 100644 --- a/av1/common/enums.h +++ b/av1/common/enums.h @@ -134,24 +134,32 @@ typedef enum ATTRIBUTE_PACKED { #if CONFIG_CB4X4 TX_2X2, // 2x2 transform #endif - TX_4X4, // 4x4 transform - TX_8X8, // 8x8 transform - TX_16X16, // 16x16 transform - TX_32X32, // 32x32 transform - TX_4X8, // 4x8 transform - TX_8X4, // 8x4 transform - TX_8X16, // 8x16 transform - TX_16X8, // 16x8 transform - TX_16X32, // 16x32 transform - TX_32X16, // 32x16 transform - TX_SIZES_ALL, // Includes rectangular transforms - TX_SIZES = TX_32X32 + 1, // Does NOT include rectangular transforms - TX_INVALID = 255 // Invalid transform size + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform +#if CONFIG_TX64X64 + TX_64X64, // 64x64 transform +#endif // CONFIG_TX64X64 + TX_4X8, // 4x8 
transform + TX_8X4, // 8x4 transform + TX_8X16, // 8x16 transform + TX_16X8, // 16x8 transform + TX_16X32, // 16x32 transform + TX_32X16, // 32x16 transform +#if 0 // CONFIG_TX64X64 + // TODO(debargha): To be enabled later + TX_32X64, // 32x64 transform + TX_64X32, // 64x32 transform +#endif // CONFIG_TX64X64 + TX_SIZES_ALL, // Includes rectangular transforms + TX_SIZES = TX_4X8, // Does NOT include rectangular transforms + TX_INVALID = 255 // Invalid transform size } TX_SIZE; #define MAX_TX_DEPTH (TX_32X32 - TX_4X4) -#define MAX_TX_SIZE_LOG2 5 +#define MAX_TX_SIZE_LOG2 (5 + CONFIG_TX64X64) #define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2) #define MIN_TX_SIZE_LOG2 2 #define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2) diff --git a/av1/common/idct.c b/av1/common/idct.c index 223c57744..2663d2d36 100644 --- a/av1/common/idct.c +++ b/av1/common/idct.c @@ -23,14 +23,14 @@ int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_SIZE tx_size) { (void)tx_type; -#if CONFIG_AOM_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - return txsize_sqr_up_map[tx_size] == TX_32X32; - } -#else (void)xd; -#endif - return txsize_sqr_up_map[tx_size] == TX_32X32; + if (txsize_sqr_up_map[tx_size] == TX_32X32) return 1; +#if CONFIG_TX64X64 + else if (txsize_sqr_up_map[tx_size] == TX_64X64) + return 2; +#endif // CONFIG_TX64X64 + else + return 0; } // NOTE: The implementation of all inverses need to be aware of the fact @@ -58,6 +58,14 @@ static void iidtx32_c(const tran_low_t *input, tran_low_t *output) { int i; for (i = 0; i < 32; ++i) output[i] = input[i] * 4; } + +#if CONFIG_TX64X64 +static void iidtx64_c(const tran_low_t *input, tran_low_t *output) { + int i; + for (i = 0; i < 64; ++i) + output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2); +} +#endif // CONFIG_TX64X64 #endif // CONFIG_EXT_TX // For use in lieu of ADST @@ -94,12 +102,6 @@ static void idct64_row_c(const tran_low_t *input, tran_low_t *output) { for (i = 0; i < 64; ++i) output[i] = 
(tran_low_t)out[i]; } -static void iidtx64_c(const tran_low_t *input, tran_low_t *output) { - int i; - for (i = 0; i < 64; ++i) - output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2); -} - // For use in lieu of ADST static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) { int i; @@ -174,7 +176,10 @@ static void highbd_iidtx64_c(const tran_low_t *input, tran_low_t *output, output[i] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * 4 * Sqrt2), bd); } +#endif // CONFIG_TX64X64 +#endif // CONFIG_EXT_TX +#if CONFIG_TX64X64 // For use in lieu of ADST static void highbd_ihalfright64_c(const tran_low_t *input, tran_low_t *output, int bd) { @@ -215,7 +220,6 @@ static void highbd_idct64_row_c(const tran_low_t *input, tran_low_t *output, for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i]; } #endif // CONFIG_TX64X64 -#endif // CONFIG_EXT_TX #endif // CONFIG_AOM_HIGHBITDEPTH // Inverse identity transform and add. @@ -223,7 +227,7 @@ static void highbd_idct64_row_c(const tran_low_t *input, tran_low_t *output, static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bs, int tx_type) { int r, c; - const int shift = bs < 32 ? 3 : 2; + const int shift = bs < 32 ? 3 : (bs < 64 ? 
2 : 1); if (tx_type == IDTX) { for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) @@ -929,6 +933,7 @@ void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, } } } +#endif // CONFIG_EXT_TX #if CONFIG_TX64X64 void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride, @@ -938,6 +943,7 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride, { ihalfright64_c, idct64_row_c }, // ADST_DCT { idct64_col_c, ihalfright64_c }, // DCT_ADST { ihalfright64_c, ihalfright64_c }, // ADST_ADST +#if CONFIG_EXT_TX { ihalfright64_c, idct64_row_c }, // FLIPADST_DCT { idct64_col_c, ihalfright64_c }, // DCT_FLIPADST { ihalfright64_c, ihalfright64_c }, // FLIPADST_FLIPADST @@ -950,6 +956,7 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride, { iidtx64_c, ihalfright64_c }, // H_ADST { ihalfright64_c, iidtx64_c }, // V_FLIPADST { iidtx64_c, ihalfright64_c }, // H_FLIPADST +#endif // CONFIG_EXT_TX }; int i, j; @@ -979,7 +986,9 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride, IHT_64[tx_type].cols(out[i], out[i]); } +#if CONFIG_EXT_TX maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64); +#endif // CONFIG_EXT_TX // Sum with the destination for (i = 0; i < 64; ++i) { @@ -991,7 +1000,6 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride, } } #endif // CONFIG_TX64X64 -#endif // CONFIG_EXT_TX // idct void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, @@ -1056,6 +1064,14 @@ void av1_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, aom_idct32x32_1024_add(input, dest, stride); } +#if CONFIG_TX64X64 +void av1_idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + (void)eob; + av1_iht64x64_4096_add(input, dest, stride, DCT_DCT); +} +#endif // CONFIG_TX64X64 + void av1_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride, int eob, TX_TYPE 
tx_type, int lossless) { if (lossless) { @@ -1206,6 +1222,35 @@ void av1_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest, int stride, } } +#if CONFIG_TX64X64 +void av1_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, int stride, + int eob, TX_TYPE tx_type) { + switch (tx_type) { + case DCT_DCT: av1_idct64x64_add(input, dest, stride, eob); break; +#if CONFIG_EXT_TX + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + av1_iht64x64_4096_add_c(input, dest, stride, tx_type); + break; + case IDTX: inv_idtx_add_c(input, dest, stride, 64, tx_type); break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } +} +#endif // CONFIG_TX64X64 + #if CONFIG_AOM_HIGHBITDEPTH void av1_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int tx_type, int bd) { @@ -1835,6 +1880,7 @@ void av1_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, } } } +#endif // CONFIG_EXT_TX #if CONFIG_TX64X64 void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1844,6 +1890,7 @@ void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8, { highbd_ihalfright64_c, highbd_idct64_row_c }, // ADST_DCT { highbd_idct64_col_c, highbd_ihalfright64_c }, // DCT_ADST { highbd_ihalfright64_c, highbd_ihalfright64_c }, // ADST_ADST +#if CONFIG_EXT_TX { highbd_ihalfright64_c, highbd_idct64_row_c }, // FLIPADST_DCT { highbd_idct64_col_c, highbd_ihalfright64_c }, // DCT_FLIPADST { highbd_ihalfright64_c, highbd_ihalfright64_c }, // FLIPADST_FLIPADST @@ -1856,6 +1903,7 @@ void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8, { highbd_iidtx64_c, highbd_ihalfright64_c }, // H_ADST { highbd_ihalfright64_c, highbd_iidtx64_c }, // V_FLIPADST { highbd_iidtx64_c, highbd_ihalfright64_c }, // 
H_FLIPADST +#endif // CONFIG_EXT_TX }; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); @@ -1887,7 +1935,9 @@ void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8, HIGH_IHT_64[tx_type].cols(out[i], out[i], bd); } +#if CONFIG_EXT_TX maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 64, 64); +#endif // CONFIG_EXT_TX // Sum with the destination for (i = 0; i < 64; ++i) { @@ -1900,7 +1950,6 @@ void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8, } } #endif // CONFIG_TX64X64 -#endif // CONFIG_EXT_TX // idct void av1_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, @@ -2155,6 +2204,42 @@ void av1_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest, default: assert(0); break; } } + +#if CONFIG_TX64X64 +void av1_highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, + int stride, int eob, int bd, + TX_TYPE tx_type) { + (void)eob; + switch (tx_type) { + case DCT_DCT: + av1_inv_txfm2d_add_64x64(input, CONVERT_TO_SHORTPTR(dest), stride, + DCT_DCT, bd); + break; +#if CONFIG_EXT_TX + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + av1_highbd_iht64x64_4096_add_c(input, dest, stride, tx_type, bd); + break; + case IDTX: + highbd_inv_idtx_add_c(input, dest, stride, 64, tx_type, bd); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } +} +#endif // CONFIG_TX64X64 #endif // CONFIG_AOM_HIGHBITDEPTH void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, @@ -2165,6 +2250,11 @@ void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, const int lossless = inv_txfm_param->lossless; switch (tx_size) { +#if CONFIG_TX64X64 + case TX_64X64: + av1_inv_txfm_add_64x64(input, dest, stride, eob, tx_type); + break; +#endif // CONFIG_TX64X64 case 
TX_32X32: av1_inv_txfm_add_32x32(input, dest, stride, eob, tx_type); break; @@ -2206,6 +2296,11 @@ void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, const int lossless = inv_txfm_param->lossless; switch (tx_size) { +#if CONFIG_TX64X64 + case TX_64X64: + av1_highbd_inv_txfm_add_64x64(input, dest, stride, eob, bd, tx_type); + break; +#endif // CONFIG_TX64X64 case TX_32X32: av1_highbd_inv_txfm_add_32x32(input, dest, stride, eob, bd, tx_type); break; diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c index ff0351630..a88c88435 100644 --- a/av1/encoder/hybrid_fwd_txfm.c +++ b/av1/encoder/hybrid_fwd_txfm.c @@ -24,6 +24,24 @@ static INLINE void fdct32x32(int rd_transform, const int16_t *src, av1_fht32x32(src, dst, src_stride, DCT_DCT); } +#if CONFIG_TX64X64 +static INLINE void fdct64x64(const int16_t *src, tran_low_t *dst, + int src_stride) { + av1_fht64x64(src, dst, src_stride, DCT_DCT); +} + +static INLINE void fdct64x64_1(const int16_t *src, tran_low_t *dst, + int src_stride) { + int i, j; + int32_t sum = 0; + memset(dst, 0, sizeof(*dst) * 4096); + for (i = 0; i < 64; ++i) + for (j = 0; j < 64; ++j) sum += src[i * src_stride + j]; + // Note: this scaling makes the transform 2 times unitary + dst[0] = ROUND_POWER_OF_TWO_SIGNED(sum, 5); +} +#endif // CONFIG_TX64X64 + static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type, int lossless) { if (lossless) { @@ -192,6 +210,41 @@ static void fwd_txfm_32x32(int rd_transform, const int16_t *src_diff, } } +#if CONFIG_TX64X64 +static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + switch (tx_type) { + case DCT_DCT: + if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL) + fdct64x64(src_diff, coeff, diff_stride); + else // FWD_TXFM_OPT_DC + fdct64x64_1(src_diff, coeff, diff_stride); + break; +#if CONFIG_EXT_TX + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + 
case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + av1_fht64x64(src_diff, coeff, diff_stride, tx_type); + break; + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: av1_fht64x64(src_diff, coeff, diff_stride, tx_type); break; + case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } +} +#endif // CONFIG_TX64X64 + #if CONFIG_AOM_HIGHBITDEPTH static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type, int lossless, @@ -379,6 +432,40 @@ static void highbd_fwd_txfm_32x32(int rd_transform, const int16_t *src_diff, default: assert(0); break; } } + +#if CONFIG_TX64X64 +static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + (void)bd; + switch (tx_type) { + case DCT_DCT: + av1_highbd_fht64x64_c(src_diff, coeff, diff_stride, tx_type); + break; +#if CONFIG_EXT_TX + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + av1_highbd_fht64x64_c(src_diff, coeff, diff_stride, tx_type); + break; + case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } +} +#endif // CONFIG_TX64X64 #endif // CONFIG_AOM_HIGHBITDEPTH void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, @@ -389,6 +476,11 @@ void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, const int rd_transform = fwd_txfm_param->rd_transform; const int lossless = fwd_txfm_param->lossless; switch (tx_size) { +#if CONFIG_TX64X64 + case TX_64X64: + 
fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; +#endif // CONFIG_TX64X64 case TX_32X32: fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); @@ -434,6 +526,12 @@ void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, const int lossless = fwd_txfm_param->lossless; const int bd = fwd_txfm_param->bd; switch (tx_size) { +#if CONFIG_TX64X64 + case TX_64X64: + highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; +#endif // CONFIG_TX64X64 case TX_32X32: highbd_fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, bd); -- 2.40.0