From e5848dea5a430763af471455916f1fe4c0e2505c Mon Sep 17 00:00:00 2001
From: Debargha Mukherjee
Date: Fri, 1 Jul 2016 12:57:14 -0700
Subject: [PATCH] Rectangular transforms 4x8 & 8x4

Added a new expt rect-tx to be used in conjunction with ext-tx.
[rect-tx is a temporary config flag and will eventually be merged
into ext-tx once it works correctly with all other experiments.]

Added 4x8 and 8x4 transforms for use initially with rectangular
sub8x8 y blocks as part of this experiment.

There is about a -0.2% BDRATE improvement on lowres; results on
other sets are pending.

When var-tx is on, rectangular transforms are currently not used.
That will be enabled in a subsequent patch.

Change-Id: Iaf3f88ede2740ffe6a0ffb1ef5fc01a16cd0283a
---
 configure                      |   1 +
 vp10/common/blockd.c           |  32 ++-
 vp10/common/blockd.h           |  29 ++-
 vp10/common/common_data.h      |  96 ++++++-
 vp10/common/entropy.c          |  31 ++-
 vp10/common/entropy.h          |  35 ++-
 vp10/common/enums.h            |  16 +-
 vp10/common/idct.c             | 304 ++++++++++++++++++++--
 vp10/common/idct.h             |  12 +
 vp10/common/loopfilter.c       |  13 +-
 vp10/common/pred_common.h      |   9 +-
 vp10/common/reconintra.c       |  10 +-
 vp10/common/scan.c             | 211 ++++++++++++++-
 vp10/common/scan.h             |   3 +-
 vp10/common/vp10_inv_txfm2d.c  |   2 +-
 vp10/common/vp10_rtcd_defs.pl  |  48 ++++
 vp10/decoder/decodeframe.c     |  66 +++--
 vp10/decoder/decodemv.c        |  64 +++--
 vp10/decoder/detokenize.c      |  42 +--
 vp10/encoder/bitstream.c       |   7 +-
 vp10/encoder/dct.c             | 194 +++++++++++---
 vp10/encoder/encodemb.c        | 458 ++++++++++-----------------
 vp10/encoder/hybrid_fwd_txfm.c |  52 ++++
 vp10/encoder/hybrid_fwd_txfm.h |  16 --
 vp10/encoder/rd.c              |  18 +-
 vp10/encoder/rdopt.c           | 167 ++++++++----
 vp10/encoder/tokenize.c        |  25 +-
 27 files changed, 1384 insertions(+), 577 deletions(-)

diff --git a/configure b/configure
index cf6a7c377..473d35e02 100755
--- a/configure
+++ b/configure
@@ -267,6 +267,7 @@ EXPERIMENT_LIST="
     fp_mb_stats
     emulate_hardware
     var_tx
+    rect_tx
     ref_mv
     dual_filter
     ext_tx
diff --git a/vp10/common/blockd.c b/vp10/common/blockd.c
index 5ca5c0527..60629170b 100644
--- a/vp10/common/blockd.c
+++ b/vp10/common/blockd.c
@@ -53,7 +53,9 @@ void vp10_foreach_transformed_block_in_plane(
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-  const int step = 1 << (tx_size << 1);
+  const uint8_t num_4x4_tw = num_4x4_blocks_wide_txsize_lookup[tx_size];
+  const uint8_t num_4x4_th = num_4x4_blocks_high_txsize_lookup[tx_size];
+  const int step = num_4x4_tw * num_4x4_th;
   int i = 0, r, c;

   // If mb_to_right_edge is < 0 we are in a situation in which
@@ -63,13 +65,15 @@
       xd->mb_to_right_edge >> (5 + pd->subsampling_x));
   const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
       xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-  const int extra_step = ((num_4x4_w - max_blocks_wide) >> tx_size) * step;
+  const int extra_step =
+      ((num_4x4_w - max_blocks_wide) >>
+       num_4x4_blocks_wide_txsize_log2_lookup[tx_size]) * step;

   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
-  for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
+  for (r = 0; r < max_blocks_high; r += num_4x4_th) {
     // Skip visiting the sub blocks that are wholly within the UMV.
- for (c = 0; c < max_blocks_wide; c += (1 << tx_size)) { + for (c = 0; c < max_blocks_wide; c += num_4x4_tw) { visit(plane, i, r, c, plane_bsize, tx_size, arg); i += step; } @@ -82,33 +86,33 @@ void vp10_foreach_transformed_block(const MACROBLOCKD* const xd, foreach_transformed_block_visitor visit, void *arg) { int plane; - for (plane = 0; plane < MAX_MB_PLANE; ++plane) vp10_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); } void vp10_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, - int aoff, int loff) { + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff) { ENTROPY_CONTEXT *const a = pd->above_context + aoff; ENTROPY_CONTEXT *const l = pd->left_context + loff; - const int tx_size_in_blocks = 1 << tx_size; + const int tx_w_in_blocks = num_4x4_blocks_wide_txsize_lookup[tx_size]; + const int tx_h_in_blocks = num_4x4_blocks_high_txsize_lookup[tx_size]; // above if (has_eob && xd->mb_to_right_edge < 0) { int i; const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] + (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); - int above_contexts = tx_size_in_blocks; + int above_contexts = tx_w_in_blocks; if (above_contexts + aoff > blocks_wide) above_contexts = blocks_wide - aoff; for (i = 0; i < above_contexts; ++i) a[i] = has_eob; - for (i = above_contexts; i < tx_size_in_blocks; ++i) + for (i = above_contexts; i < tx_w_in_blocks; ++i) a[i] = 0; } else { - memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_w_in_blocks); } // left @@ -116,16 +120,16 @@ void vp10_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, int i; const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - int left_contexts = tx_size_in_blocks; + int left_contexts = tx_h_in_blocks; if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff; for (i = 0; i < left_contexts; ++i) l[i] = has_eob; - for (i = left_contexts; i < tx_size_in_blocks; ++i) + for (i = left_contexts; i < tx_h_in_blocks; ++i) l[i] = 0; } else { - memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_h_in_blocks); } } diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h index 4c46cbb71..399fefe56 100644 --- a/vp10/common/blockd.h +++ b/vp10/common/blockd.h @@ -422,6 +422,18 @@ static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) { } #endif // CONFIG_SUPERTX +static INLINE int get_tx1d_width(TX_SIZE tx_size) { + return num_4x4_blocks_wide_txsize_lookup[tx_size] << 2; +} + +static INLINE int get_tx1d_height(TX_SIZE tx_size) { + return num_4x4_blocks_high_txsize_lookup[tx_size] << 2; +} + +static INLINE int get_tx2d_size(TX_SIZE tx_size) { + return num_4x4_blocks_txsize_lookup[tx_size] << 4; +} + #if CONFIG_EXT_TX #define ALLOW_INTRA_EXT_TX 1 // whether masked transforms are used for 32X32 @@ -438,6 +450,7 @@ static const int num_ext_tx_set_intra[EXT_TX_SETS_INTRA] = { #if EXT_TX_SIZES == 4 static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs, int is_inter) { + tx_size = txsize_sqr_map[tx_size]; if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0; #if USE_REDUCED_TXSET_FOR_16X16 if (tx_size == TX_32X32) @@ -468,6 +481,7 @@ static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][TX_SIZES] = { static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs, int is_inter) { (void) is_inter; + tx_size = 
txsize_sqr_map[tx_size]; if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0; if (tx_size == TX_32X32) return 0; #if USE_REDUCED_TXSET_FOR_16X16 @@ -622,10 +636,11 @@ static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, #if CONFIG_EXT_TX #if EXT_TX_SIZES == 4 - if (xd->lossless[mbmi->segment_id] || tx_size > TX_32X32 || - (tx_size >= TX_32X32 && !is_inter_block(mbmi))) + if (xd->lossless[mbmi->segment_id] || + txsize_sqr_map[tx_size] > TX_32X32 || + (txsize_sqr_map[tx_size] >= TX_32X32 && !is_inter_block(mbmi))) #else - if (xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32) + if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] >= TX_32X32) #endif return DCT_DCT; if (mbmi->sb_type >= BLOCK_8X8) { @@ -637,8 +652,8 @@ static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, } if (is_inter_block(mbmi)) // UV Inter only - return (mbmi->tx_type == IDTX && tx_size == TX_32X32 ? - DCT_DCT : mbmi->tx_type); + return (mbmi->tx_type == IDTX && txsize_sqr_map[tx_size] == TX_32X32) ? + DCT_DCT : mbmi->tx_type; } // Sub8x8-Inter/Intra OR UV-Intra @@ -647,10 +662,10 @@ static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, else // Sub8x8 Intra OR UV-Intra return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y ? get_y_mode(mi, block_idx) : mbmi->uv_mode]; -#else +#else // CONFIG_EXT_TX (void) block_idx; if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] || - tx_size >= TX_32X32) + txsize_sqr_map[tx_size] >= TX_32X32) return DCT_DCT; return mbmi->tx_type; #endif // CONFIG_EXT_TX diff --git a/vp10/common/common_data.h b/vp10/common/common_data.h index 44ebff2dc..250698670 100644 --- a/vp10/common/common_data.h +++ b/vp10/common/common_data.h @@ -50,6 +50,46 @@ static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES] = static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, IF_EXT_PARTITION(8, 4, 8)}; +static const uint8_t num_4x4_blocks_txsize_lookup[TX_SIZES_ALL] = { + 1, 4, 16, 64, +#if CONFIG_EXT_TX + 2, 2 +#endif // CONFIG_EXT_TX +}; +static const uint8_t num_4x4_blocks_wide_txsize_lookup[TX_SIZES_ALL] = { + 1, 2, 4, 8, +#if CONFIG_EXT_TX + 1, 2 +#endif // CONFIG_EXT_TX +}; +static const uint8_t num_4x4_blocks_high_txsize_lookup[TX_SIZES_ALL] = { + 1, 2, 4, 8, +#if CONFIG_EXT_TX + 2, 1 +#endif // CONFIG_EXT_TX +}; + +static const uint8_t num_4x4_blocks_txsize_log2_lookup[TX_SIZES_ALL] = { + 0, 2, 4, 6, +#if CONFIG_EXT_TX + 1, 1 +#endif // CONFIG_EXT_TX +}; +static const uint8_t num_4x4_blocks_wide_txsize_log2_lookup + [TX_SIZES_ALL] = { + 0, 1, 2, 3, +#if CONFIG_EXT_TX + 0, 1 +#endif // CONFIG_EXT_TX +}; +static const uint8_t num_4x4_blocks_high_txsize_log2_lookup + [TX_SIZES_ALL] = { + 0, 1, 2, 3, +#if CONFIG_EXT_TX + 1, 0 +#endif // CONFIG_EXT_TX +}; + // VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize))) static const uint8_t size_group_lookup[BLOCK_SIZES] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, IF_EXT_PARTITION(3, 3, 3)}; @@ -297,13 +337,59 @@ static const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = { #endif // CONFIG_EXT_PARTITION }; -static const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = { - BLOCK_4X4, // TX_4X4 - BLOCK_8X8, // TX_8X8 - BLOCK_16X16, // TX_16X16 - BLOCK_32X32, // TX_32X32 +#if CONFIG_EXT_TX +static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES] = { + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X8, TX_8X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X8, TX_8X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X16, TX_16X16, TX_32X32, + // 32X64, 64X32, 64X64 + TX_32X32, TX_32X32, TX_32X32, +#if 
CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + TX_32X32, TX_32X32, TX_32X32, +#endif // CONFIG_EXT_PARTITION +}; +#endif // CONFIG_EXT_TX + +static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = { + BLOCK_4X4, // TX_4X4 + BLOCK_8X8, // TX_8X8 + BLOCK_16X16, // TX_16X16 + BLOCK_32X32, // TX_32X32 +#if CONFIG_EXT_TX + BLOCK_4X8, // TX_4X8 + BLOCK_8X4, // TX_8X4 +#endif // CONFIG_EXT_TX +}; + +static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 +#if CONFIG_EXT_TX + TX_4X4, // TX_4X8 + TX_4X4, // TX_8X4 +#endif // CONFIG_EXT_TX }; +static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 +#if CONFIG_EXT_TX + TX_8X8, // TX_4X8 + TX_8X8, // TX_8X4 +#endif // CONFIG_EXT_TX +}; + + static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_4X4, // ONLY_4X4 TX_8X8, // ALLOW_8X8 diff --git a/vp10/common/entropy.c b/vp10/common/entropy.c index eea552cf4..1ce801a65 100644 --- a/vp10/common/entropy.c +++ b/vp10/common/entropy.c @@ -56,11 +56,33 @@ const vpx_prob vp10_cat6_prob_high12[] = { }; #endif +const uint16_t band_count_table[TX_SIZES_ALL][8] = { + { 1, 2, 3, 4, 3, 16 - 13, 0 }, + { 1, 2, 3, 4, 11, 64 - 21, 0 }, + { 1, 2, 3, 4, 11, 256 - 21, 0 }, + { 1, 2, 3, 4, 11, 1024 - 21, 0 }, +#if CONFIG_EXT_TX + { 1, 2, 3, 4, 8, 32 - 18, 0 }, + { 1, 2, 3, 4, 8, 32 - 18, 0 }, +#endif // CONFIG_EXT_TX +}; + +const uint16_t band_cum_count_table[TX_SIZES_ALL][8] = { + { 0, 1, 3, 6, 10, 13, 16, 0 }, + { 0, 1, 3, 6, 10, 21, 64, 0 }, + { 0, 1, 3, 6, 10, 21, 256, 0 }, + { 0, 1, 3, 6, 10, 21, 1024, 0 }, +#if CONFIG_EXT_TX + { 0, 1, 3, 6, 10, 18, 32, 0 }, + { 0, 1, 3, 6, 10, 18, 32, 0 }, +#endif // CONFIG_EXT_TX +}; + const uint8_t vp10_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, @@ -125,6 +147,13 @@ const uint8_t vp10_coefband_trans_8x8plus[1024] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; +#if CONFIG_EXT_TX +const uint8_t vp10_coefband_trans_8x4_4x8[32] = { + 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, + 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +}; +#endif // CONFIG_EXT_TX + const uint8_t vp10_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, }; diff --git a/vp10/common/entropy.h b/vp10/common/entropy.h index d0ca880aa..baaa5151f 100644 --- a/vp10/common/entropy.h +++ b/vp10/common/entropy.h @@ -155,11 +155,28 @@ void vp10_partial_adapt_probs(struct VP10Common *cm, int mi_row, int mi_col); #define MAXBAND_INDEX 21 DECLARE_ALIGNED(16, extern const uint8_t, vp10_coefband_trans_8x8plus[1024]); +#if CONFIG_EXT_TX +DECLARE_ALIGNED(16, extern const uint8_t, vp10_coefband_trans_8x4_4x8[32]); +#endif // CONFIG_EXT_TX DECLARE_ALIGNED(16, extern const uint8_t, vp10_coefband_trans_4x4[16]); +DECLARE_ALIGNED(16, extern const uint16_t, + band_count_table[TX_SIZES_ALL][8]); +DECLARE_ALIGNED(16, extern const uint16_t, + band_cum_count_table[TX_SIZES_ALL][8]); + static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { - return tx_size == TX_4X4 ? 
vp10_coefband_trans_4x4 - : vp10_coefband_trans_8x8plus; + switch (tx_size) { + case TX_4X4: + return vp10_coefband_trans_4x4; +#if CONFIG_EXT_TX + case TX_4X8: + case TX_8X4: + return vp10_coefband_trans_8x4_4x8; +#endif // CONFIG_EXT_TX + default: + return vp10_coefband_trans_8x8plus; + } } // 128 lists of probabilities are stored for the following ONE node probs: @@ -198,7 +215,8 @@ static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, return (a != 0) + (b != 0); } -static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, +static INLINE int get_entropy_context(TX_SIZE tx_size, + const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; @@ -207,6 +225,16 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, above_ec = a[0] != 0; left_ec = l[0] != 0; break; +#if CONFIG_EXT_TX + case TX_4X8: + above_ec = a[0] != 0; + left_ec = !!*(const uint16_t *)l; + break; + case TX_8X4: + above_ec = !!*(const uint16_t *)a; + left_ec = l[0] != 0; + break; +#endif // CONFIG_EXT_TX case TX_8X8: above_ec = !!*(const uint16_t *)a; left_ec = !!*(const uint16_t *)l; @@ -223,7 +251,6 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, assert(0 && "Invalid transform size."); break; } - return combine_entropy_contexts(above_ec, left_ec); } diff --git a/vp10/common/enums.h b/vp10/common/enums.h index d1ce121cd..a93beccc8 100644 --- a/vp10/common/enums.h +++ b/vp10/common/enums.h @@ -137,6 +137,14 @@ typedef uint8_t TX_SIZE; #define TX_32X32 ((TX_SIZE)3) // 32x32 transform #define TX_SIZES ((TX_SIZE)4) +#if CONFIG_EXT_TX +#define TX_4X8 ((TX_SIZE)4) // 4x8 transform +#define TX_8X4 ((TX_SIZE)5) // 8x4 transform +#define TX_SIZES_ALL ((TX_SIZE)6) // Includes rectangular transforms +#else +#define TX_SIZES_ALL ((TX_SIZE)4) +#endif // CONFIG_EXT_TX + #define MAX_TX_SIZE_LOG2 5 #define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2) #define MIN_TX_SIZE_LOG2 2 @@ -170,10 +178,10 @@ typedef enum { } TX_TYPE_1D; typedef enum { - DCT_DCT = 0, // DCT in both horizontal and vertical - ADST_DCT = 1, // ADST in vertical, DCT in horizontal - DCT_ADST = 2, // DCT in vertical, ADST in horizontal - ADST_ADST = 3, // ADST in both directions + DCT_DCT = 0, // DCT in both horizontal and vertical + ADST_DCT = 1, // ADST in vertical, DCT in horizontal + DCT_ADST = 2, // DCT in vertical, ADST in horizontal + ADST_ADST = 3, // ADST in both directions #if CONFIG_EXT_TX FLIPADST_DCT = 4, DCT_FLIPADST = 5, diff --git a/vp10/common/idct.c b/vp10/common/idct.c index 1a573bd19..9b70857fa 100644 --- a/vp10/common/idct.c +++ b/vp10/common/idct.c @@ -144,7 +144,7 @@ static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride, static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src, int *sstride, - int tx_type, int size) { + int tx_type, int sizey, int sizex) { // Note that the transpose of src will be added to dst. In order to LR // flip the addends (in dst coordinates), we UD flip the src. To UD flip // the addends, we UD flip the dst. 
@@ -163,19 +163,19 @@ static void maybe_flip_strides(uint8_t **dst, int *dstride, case FLIPADST_ADST: case V_FLIPADST: // flip UD - FLIPUD_PTR(*dst, *dstride, size); + FLIPUD_PTR(*dst, *dstride, sizey); break; case DCT_FLIPADST: case ADST_FLIPADST: case H_FLIPADST: // flip LR - FLIPUD_PTR(*src, *sstride, size); + FLIPUD_PTR(*src, *sstride, sizex); break; case FLIPADST_FLIPADST: // flip UD - FLIPUD_PTR(*dst, *dstride, size); + FLIPUD_PTR(*dst, *dstride, sizey); // flip LR - FLIPUD_PTR(*src, *sstride, size); + FLIPUD_PTR(*src, *sstride, sizex); break; default: assert(0); @@ -445,7 +445,7 @@ static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8, static void maybe_flip_strides16(uint16_t **dst, int *dstride, tran_low_t **src, int *sstride, - int tx_type, int size) { + int tx_type, int sizey, int sizex) { // Note that the transpose of src will be added to dst. In order to LR // flip the addends (in dst coordinates), we UD flip the src. To UD flip // the addends, we UD flip the dst. @@ -464,19 +464,19 @@ static void maybe_flip_strides16(uint16_t **dst, int *dstride, case FLIPADST_ADST: case V_FLIPADST: // flip UD - FLIPUD_PTR(*dst, *dstride, size); + FLIPUD_PTR(*dst, *dstride, sizey); break; case DCT_FLIPADST: case ADST_FLIPADST: case H_FLIPADST: // flip LR - FLIPUD_PTR(*src, *sstride, size); + FLIPUD_PTR(*src, *sstride, sizex); break; case FLIPADST_FLIPADST: // flip UD - FLIPUD_PTR(*dst, *dstride, size); + FLIPUD_PTR(*dst, *dstride, sizey); // flip LR - FLIPUD_PTR(*src, *sstride, size); + FLIPUD_PTR(*src, *sstride, sizex); break; default: assert(0); @@ -536,7 +536,7 @@ void vp10_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, } #if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4); + maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 4); #endif // Sum with the destination @@ -549,6 +549,116 @@ void vp10_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, } } +void vp10_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + static const transform_2d IHT_4x8[] = { + { idct8_c, idct4_c }, // DCT_DCT + { iadst8_c, idct4_c }, // ADST_DCT + { idct8_c, iadst4_c }, // DCT_ADST + { iadst8_c, iadst4_c }, // ADST_ADST +#if CONFIG_EXT_TX + { iadst8_c, idct4_c }, // FLIPADST_DCT + { idct8_c, iadst4_c }, // DCT_FLIPADST + { iadst8_c, iadst4_c }, // FLIPADST_FLIPADST + { iadst8_c, iadst4_c }, // ADST_FLIPADST + { iadst8_c, iadst4_c }, // FLIPADST_ADST + { iidtx8_c, iidtx4_c }, // IDTX + { idct8_c, iidtx4_c }, // V_DCT + { iidtx8_c, idct4_c }, // H_DCT + { iadst8_c, iidtx4_c }, // V_ADST + { iidtx8_c, iadst4_c }, // H_ADST + { iadst8_c, iidtx4_c }, // V_FLIPADST + { iidtx8_c, iadst4_c }, // H_FLIPADST +#endif // CONFIG_EXT_TX + }; + + int i, j; + tran_low_t out[4][8], outtmp[4]; + tran_low_t *outp = &out[0][0]; + int outstride = 8; + + // inverse transform row vectors and transpose + for (i = 0; i < 8; ++i) { + IHT_4x8[tx_type].rows(input, outtmp); + for (j = 0; j < 4; ++j) + out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2); + input += 4; + } + + // inverse transform column vectors + for (i = 0; i < 4; ++i) { + IHT_4x8[tx_type].cols(out[i], out[i]); + } + +#if CONFIG_EXT_TX + maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8, 4); +#endif + + // Sum with the destination + for (i = 0; i < 8; ++i) { + for (j = 0; j < 4; ++j) { + int d = i * stride + j; + int s = j * outstride + i; + dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); + 
} + } +} + +void vp10_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + static const transform_2d IHT_8x4[] = { + { idct4_c, idct8_c }, // DCT_DCT + { iadst4_c, idct8_c }, // ADST_DCT + { idct4_c, iadst8_c }, // DCT_ADST + { iadst4_c, iadst8_c }, // ADST_ADST +#if CONFIG_EXT_TX + { iadst4_c, idct8_c }, // FLIPADST_DCT + { idct4_c, iadst8_c }, // DCT_FLIPADST + { iadst4_c, iadst8_c }, // FLIPADST_FLIPADST + { iadst4_c, iadst8_c }, // ADST_FLIPADST + { iadst4_c, iadst8_c }, // FLIPADST_ADST + { iidtx4_c, iidtx8_c }, // IDTX + { idct4_c, iidtx8_c }, // V_DCT + { iidtx4_c, idct8_c }, // H_DCT + { iadst4_c, iidtx8_c }, // V_ADST + { iidtx4_c, iadst8_c }, // H_ADST + { iadst4_c, iidtx8_c }, // V_FLIPADST + { iidtx4_c, iadst8_c }, // H_FLIPADST +#endif // CONFIG_EXT_TX + }; + + int i, j; + tran_low_t out[8][4], outtmp[8]; + tran_low_t *outp = &out[0][0]; + int outstride = 4; + + // inverse transform row vectors and transpose + for (i = 0; i < 4; ++i) { + IHT_8x4[tx_type].rows(input, outtmp); + for (j = 0; j < 8; ++j) + out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2); + input += 8; + } + + // inverse transform column vectors + for (i = 0; i < 8; ++i) { + IHT_8x4[tx_type].cols(out[i], out[i]); + } + +#if CONFIG_EXT_TX + maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 8); +#endif + + // Sum with the destination + for (i = 0; i < 4; ++i) { + for (j = 0; j < 8; ++j) { + int d = i * stride + j; + int s = j * outstride + i; + dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); + } + } +} + void vp10_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { static const transform_2d IHT_8[] = { @@ -599,7 +709,7 @@ void vp10_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, } #if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8); + maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8, 8); #endif // Sum with the destination @@ -662,7 +772,7 @@ void vp10_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, } #if CONFIG_EXT_TX - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16); + maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16, 16); #endif // Sum with the destination @@ -723,7 +833,7 @@ void vp10_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, IHT_32[tx_type].cols(out[i], out[i]); } - maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32); + maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32, 32); // Sum with the destination for (i = 0; i < 32; ++i) { @@ -840,6 +950,20 @@ void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, } } +#if CONFIG_EXT_TX +void vp10_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, + int stride, int eob, TX_TYPE tx_type) { + (void) eob; + vp10_iht8x4_32_add(input, dest, stride, tx_type); +} + +void vp10_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, + int stride, int eob, TX_TYPE tx_type) { + (void) eob; + vp10_iht4x8_32_add(input, dest, stride, tx_type); +} +#endif // CONFIG_EXT_TX + void vp10_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride, int eob, TX_TYPE tx_type) { switch (tx_type) { @@ -1002,7 +1126,7 @@ void vp10_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } #if CONFIG_EXT_TX - maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 4); + maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 4, 4); #endif // Sum with the 
destination @@ -1016,6 +1140,118 @@ void vp10_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } +#if CONFIG_EXT_TX +void vp10_highbd_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int tx_type, int bd) { + static const highbd_transform_2d HIGH_IHT_4x8[] = { + { vpx_highbd_idct8_c, vpx_highbd_idct4_c }, // DCT_DCT + { vpx_highbd_iadst8_c, vpx_highbd_idct4_c }, // ADST_DCT + { vpx_highbd_idct8_c, vpx_highbd_iadst4_c }, // DCT_ADST + { vpx_highbd_iadst8_c, vpx_highbd_iadst4_c }, // ADST_ADST + { vpx_highbd_iadst8_c, vpx_highbd_idct4_c }, // FLIPADST_DCT + { vpx_highbd_idct8_c, vpx_highbd_iadst4_c }, // DCT_FLIPADST + { vpx_highbd_iadst8_c, vpx_highbd_iadst4_c }, // FLIPADST_FLIPADST + { vpx_highbd_iadst8_c, vpx_highbd_iadst4_c }, // ADST_FLIPADST + { vpx_highbd_iadst8_c, vpx_highbd_iadst4_c }, // FLIPADST_ADST + { highbd_iidtx8_c, highbd_iidtx4_c }, // IDTX + { vpx_highbd_idct8_c, highbd_iidtx4_c }, // V_DCT + { highbd_iidtx8_c, vpx_highbd_idct4_c }, // H_DCT + { vpx_highbd_iadst8_c, highbd_iidtx4_c }, // V_ADST + { highbd_iidtx8_c, vpx_highbd_iadst4_c }, // H_ADST + { vpx_highbd_iadst8_c, highbd_iidtx4_c }, // V_FLIPADST + { highbd_iidtx8_c, vpx_highbd_iadst4_c }, // H_FLIPADST + }; + + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + int i, j; + tran_low_t out[4][8], outtmp[4]; + tran_low_t *outp = &out[0][0]; + int outstride = 8; + + // inverse transform row vectors, and transpose + for (i = 0; i < 8; ++i) { + HIGH_IHT_4x8[tx_type].rows(input, outtmp, bd); + for (j = 0; j < 4; ++j) + out[j][i] = (tran_low_t)highbd_dct_const_round_shift(outtmp[j] * Sqrt2, + bd); + input += 4; + } + + // inverse transform column vectors + for (i = 0; i < 4; ++i) { + HIGH_IHT_4x8[tx_type].cols(out[i], out[i], bd); + } + + maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 8, 4); + + // Sum with the destination + for (i = 0; i < 8; ++i) { + for (j = 0; j < 4; ++j) { + int d = i * stride + j; + int s = j * outstride + i; + dest[d] = highbd_clip_pixel_add(dest[d], + ROUND_POWER_OF_TWO(outp[s], 5), bd); + } + } +} + +void vp10_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int tx_type, int bd) { + static const highbd_transform_2d HIGH_IHT_8x4[] = { + { vpx_highbd_idct4_c, vpx_highbd_idct8_c }, // DCT_DCT + { vpx_highbd_iadst4_c, vpx_highbd_idct8_c }, // ADST_DCT + { vpx_highbd_idct4_c, vpx_highbd_iadst8_c }, // DCT_ADST + { vpx_highbd_iadst4_c, vpx_highbd_iadst8_c }, // ADST_ADST + { vpx_highbd_iadst4_c, vpx_highbd_idct8_c }, // FLIPADST_DCT + { vpx_highbd_idct4_c, vpx_highbd_iadst8_c }, // DCT_FLIPADST + { vpx_highbd_iadst4_c, vpx_highbd_iadst8_c }, // FLIPADST_FLIPADST + { vpx_highbd_iadst4_c, vpx_highbd_iadst8_c }, // ADST_FLIPADST + { vpx_highbd_iadst4_c, vpx_highbd_iadst8_c }, // FLIPADST_ADST + { highbd_iidtx4_c, highbd_iidtx8_c }, // IDTX + { vpx_highbd_idct4_c, highbd_iidtx8_c }, // V_DCT + { highbd_iidtx4_c, vpx_highbd_idct8_c }, // H_DCT + { vpx_highbd_iadst4_c, highbd_iidtx8_c }, // V_ADST + { highbd_iidtx4_c, vpx_highbd_iadst8_c }, // H_ADST + { vpx_highbd_iadst4_c, highbd_iidtx8_c }, // V_FLIPADST + { highbd_iidtx4_c, vpx_highbd_iadst8_c }, // H_FLIPADST + }; + + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + int i, j; + tran_low_t out[8][4], outtmp[8]; + tran_low_t *outp = &out[0][0]; + int outstride = 4; + + // inverse transform row vectors, and transpose + for (i = 0; i < 4; ++i) { + HIGH_IHT_8x4[tx_type].rows(input, outtmp, bd); + for (j = 0; j < 8; ++j) + out[j][i] = 
(tran_low_t)highbd_dct_const_round_shift(outtmp[j] * Sqrt2, + bd); + input += 8; + } + + // inverse transform column vectors + for (i = 0; i < 8; ++i) { + HIGH_IHT_8x4[tx_type].cols(out[i], out[i], bd); + } + + maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 4, 8); + + // Sum with the destination + for (i = 0; i < 4; ++i) { + for (j = 0; j < 8; ++j) { + int d = i * stride + j; + int s = j * outstride + i; + dest[d] = highbd_clip_pixel_add(dest[d], + ROUND_POWER_OF_TWO(outp[s], 5), bd); + } + } +} +#endif // CONFIG_EXT_TX + void vp10_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int tx_type, int bd) { static const highbd_transform_2d HIGH_IHT_8[] = { @@ -1068,7 +1304,7 @@ void vp10_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, } #if CONFIG_EXT_TX - maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 8); + maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 8, 8); #endif // Sum with the destination @@ -1134,7 +1370,7 @@ void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } #if CONFIG_EXT_TX - maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 16); + maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 16, 16); #endif // Sum with the destination @@ -1198,7 +1434,7 @@ void vp10_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, HIGH_IHT_32[tx_type].cols(out[i], out[i], bd); } - maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32); + maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32, 32); // Sum with the destination for (i = 0; i < 32; ++i) { @@ -1320,6 +1556,22 @@ void vp10_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, } } +#if CONFIG_EXT_TX +void vp10_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, + int stride, int eob, int bd, + TX_TYPE tx_type) { + (void) eob; + vp10_highbd_iht8x4_32_add_c(input, dest, stride, tx_type, bd); +} + +void vp10_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, + int stride, int eob, int bd, + TX_TYPE tx_type) { + (void) eob; + vp10_highbd_iht4x8_32_add_c(input, dest, stride, tx_type, bd); +} +#endif // CONFIG_EXT_TX + void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd, TX_TYPE tx_type) { @@ -1454,6 +1706,14 @@ void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, case TX_8X8: vp10_inv_txfm_add_8x8(input, dest, stride, eob, tx_type); break; +#if CONFIG_EXT_TX + case TX_4X8: + vp10_inv_txfm_add_4x8(input, dest, stride, eob, tx_type); + break; + case TX_8X4: + vp10_inv_txfm_add_8x4(input, dest, stride, eob, tx_type); + break; +#endif // CONFIG_EXT_TX case TX_4X4: // this is like vp10_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless @@ -1486,6 +1746,14 @@ void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride, case TX_8X8: vp10_highbd_inv_txfm_add_8x8(input, dest, stride, eob, bd, tx_type); break; +#if CONFIG_EXT_TX + case TX_4X8: + vp10_highbd_inv_txfm_add_4x8(input, dest, stride, eob, bd, tx_type); + break; + case TX_8X4: + vp10_highbd_inv_txfm_add_8x4(input, dest, stride, eob, bd, tx_type); + break; +#endif // CONFIG_EXT_TX case TX_4X4: // this is like vp10_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless diff --git a/vp10/common/idct.h b/vp10/common/idct.h index 5d5231434..f20a154ad 100644 --- 
a/vp10/common/idct.h
+++ b/vp10/common/idct.h
@@ -66,6 +66,12 @@ void vp10_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
 void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                            int stride, int eob, TX_TYPE tx_type,
                            int lossless);
+#if CONFIG_EXT_TX
+void vp10_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+                           int stride, int eob, TX_TYPE tx_type);
+void vp10_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+                           int stride, int eob, TX_TYPE tx_type);
+#endif  // CONFIG_EXT_TX
 void vp10_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                            int stride, int eob, TX_TYPE tx_type);
 void vp10_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
@@ -88,6 +94,12 @@ void vp10_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
 void vp10_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                                   int stride, int eob, int bd,
                                   TX_TYPE tx_type, int lossless);
+#if CONFIG_EXT_TX
+void vp10_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+                                  int stride, int eob, int bd, TX_TYPE tx_type);
+void vp10_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+                                  int stride, int eob, int bd, TX_TYPE tx_type);
+#endif  // CONFIG_EXT_TX
 void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                                   int stride, int eob, int bd, TX_TYPE tx_type);
 void vp10_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 55715d7da..59446c229 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -722,8 +722,11 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
                         LOOP_FILTER_MASK *lfm) {
   const MB_MODE_INFO *mbmi = &mi->mbmi;
   const BLOCK_SIZE block_size = mbmi->sb_type;
-  const TX_SIZE tx_size_y = mbmi->tx_size;
-  const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
+  // TODO(debargha): Check if masks can be set up correctly when
+  // rectangular transforms are used with the EXT_TX expt.
+  const TX_SIZE tx_size_y = txsize_sqr_map[mbmi->tx_size];
+  const TX_SIZE tx_size_uv =
+      get_uv_tx_size_impl(mbmi->tx_size, block_size, 1, 1);
   const int filter_level = get_filter_level(lfi_n, mbmi);
   uint64_t *const left_y = &lfm->left_y[tx_size_y];
   uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -803,7 +806,7 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
 #endif  // CONFIG_SUPERTX
                          LOOP_FILTER_MASK *lfm) {
   const MB_MODE_INFO *mbmi = &mi->mbmi;
-  const TX_SIZE tx_size_y = mbmi->tx_size;
+  const TX_SIZE tx_size_y = txsize_sqr_map[mbmi->tx_size];
 #if CONFIG_SUPERTX
   const BLOCK_SIZE block_size = supertx_enabled ?
(BLOCK_SIZE)(3 * tx_size_y) : mbmi->sb_type; @@ -1267,8 +1270,8 @@ void vp10_filter_block_plane_non420(VP10_COMMON *cm, const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; - TX_SIZE tx_size_c = tx_size; - TX_SIZE tx_size_r = tx_size; + TX_SIZE tx_size_c = num_4x4_blocks_wide_txsize_log2_lookup[tx_size]; + TX_SIZE tx_size_r = num_4x4_blocks_high_txsize_log2_lookup[tx_size]; int tx_size_mask = 0; // Filter level can vary per MI diff --git a/vp10/common/pred_common.h b/vp10/common/pred_common.h index d4ae98037..9b73eb27e 100644 --- a/vp10/common/pred_common.h +++ b/vp10/common/pred_common.h @@ -177,10 +177,11 @@ static INLINE int get_tx_size_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; const int has_above = xd->up_available; const int has_left = xd->left_available; - int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size - : max_tx_size; - int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size - : max_tx_size; + int above_ctx = (has_above && !above_mbmi->skip) ? + (int)txsize_sqr_map[above_mbmi->tx_size] : max_tx_size; + int left_ctx = (has_left && !left_mbmi->skip) ? + (int)txsize_sqr_map[left_mbmi->tx_size] : max_tx_size; + assert(xd->mi[0]->mbmi.sb_type >= BLOCK_8X8); if (!has_left) left_ctx = above_ctx; diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c index 89ff13b3f..fe9837301 100644 --- a/vp10/common/reconintra.c +++ b/vp10/common/reconintra.c @@ -673,7 +673,7 @@ static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, INTRA_FILTER filter_type) { const int dx = (int)dr_intra_derivative[angle][0]; const int dy = (int)dr_intra_derivative[angle][1]; - const int bs = 4 << tx_size; + const int bs = 4 * num_4x4_blocks_wide_txsize_lookup[tx_size]; assert(angle > 0 && angle < 270); if (angle > 0 && angle < 90) { @@ -1159,7 +1159,7 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd, DECLARE_ALIGNED(16, uint16_t, above_data[MAX_SB_SIZE + 16]); uint16_t *above_row = above_data + 16; const uint16_t *const_above_row = above_row; - const int bs = 4 << tx_size; + const int bs = 4 * num_4x4_blocks_wide_txsize_lookup[tx_size]; int need_left = extend_modes[mode] & NEED_LEFT; int need_above = extend_modes[mode] & NEED_ABOVE; const uint16_t *above_ref = ref - ref_stride; @@ -1331,7 +1331,7 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, DECLARE_ALIGNED(16, uint8_t, above_data[MAX_SB_SIZE + 16]); uint8_t *above_row = above_data + 16; const uint8_t *const_above_row = above_row; - const int bs = 4 << tx_size; + const int bs = 4 * num_4x4_blocks_wide_txsize_lookup[tx_size]; int need_left = extend_modes[mode] & NEED_LEFT; int need_above = extend_modes[mode] & NEED_ABOVE; #if CONFIG_EXT_INTRA @@ -1491,7 +1491,7 @@ void vp10_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) { - const int txw = (1 << tx_size); + const int txw = num_4x4_blocks_wide_txsize_lookup[tx_size]; const int have_top = row_off || xd->up_available; const int have_left = col_off || xd->left_available; const int x = col_off * 4; @@ -1531,7 +1531,7 @@ void vp10_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in, (hpx - y - txpx); if (xd->mi[0]->mbmi.palette_mode_info.palette_size[plane != 0] > 0) { - const int bs = 4 * (1 << tx_size); + const int bs = 4 * 
num_4x4_blocks_wide_txsize_lookup[tx_size]; const int stride = 4 * (1 << bwl_in); int r, c; uint8_t *map = NULL; diff --git a/vp10/common/scan.c b/vp10/common/scan.c index 8cfeb975f..4c176d3e0 100644 --- a/vp10/common/scan.c +++ b/vp10/common/scan.c @@ -49,6 +49,50 @@ DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = { 13, 11, 14, 15, }; +#if CONFIG_EXT_TX +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = { + 0, 1, 4, 5, 2, 8, 6, 9, + 10, 3, 12, 7, 13, 11, 14, 16, + 17, 15, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = { + 0, 4, 8, 12, 16, 20, 24, 28, + 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, + 3, 7, 11, 15, 19, 23, 27, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = { + 0, 1, 8, 9, 2, 16, 10, 17, + 18, 3, 24, 11, 25, 19, 26, 4, + 12, 27, 20, 5, 28, 13, 21, 29, + 6, 14, 22, 30, 7, 15, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, + 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, + 6, 14, 22, 30, 7, 15, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, +}; +#endif // CONFIG_EXT_TX + DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26, @@ -824,6 +868,86 @@ DECLARE_ALIGNED(16, static const int16_t, 9, 12, 7, 10, 10, 13, 11, 14, 0, 0, }; +#if CONFIG_EXT_TX +DECLARE_ALIGNED(16, static const int16_t, + default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 1, 4, + 1, 1, 4, 4, 2, 5, 5, 8, + 6, 9, 2, 2, 8, 8, 3, 6, + 9, 12, 7, 10, 10, 13, 12, 12, + 13, 16, 11, 14, 14, 17, 15, 18, + 16, 16, 17, 20, 18, 21, 19, 22, + 20, 20, 21, 24, 22, 25, 23, 26, + 24, 24, 25, 28, 26, 29, 27, 30, + 0, 0 +}; + +DECLARE_ALIGNED(16, static const int16_t, + mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 4, 4, 8, 8, + 12, 12, 16, 16, 20, 20, 24, 24, + 0, 0, 1, 4, 5, 8, 9, 12, + 13, 16, 17, 20, 21, 24, 25, 28, + 1, 1, 2, 5, 6, 9, 10, 13, + 14, 17, 18, 21, 22, 25, 26, 29, + 2, 2, 3, 6, 7, 10, 11, 14, + 15, 18, 19, 22, 23, 26, 27, 30, + 0, 0 +}; + +DECLARE_ALIGNED(16, static const int16_t, + mrow_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 1, 1, 2, 2, + 0, 0, 1, 4, 2, 5, 3, 6, + 4, 4, 5, 8, 6, 9, 7, 10, + 8, 8, 9, 12, 10, 13, 11, 14, + 12, 12, 13, 16, 14, 17, 15, 18, + 16, 16, 17, 20, 18, 21, 19, 22, + 20, 20, 21, 24, 22, 25, 23, 26, + 24, 24, 25, 28, 26, 29, 27, 30, + 0, 0 +}; + +DECLARE_ALIGNED(16, static const int16_t, + default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 1, 8, + 1, 1, 8, 8, 2, 9, 9, 16, + 10, 17, 2, 2, 16, 16, 3, 10, + 17, 24, 11, 18, 18, 25, 3, 3, + 4, 11, 19, 26, 12, 19, 4, 4, + 20, 27, 5, 12, 13, 20, 21, 28, + 5, 5, 6, 13, 14, 21, 22, 29, + 6, 6, 7, 14, 15, 22, 23, 30, + 0, 0 +}; + +DECLARE_ALIGNED(16, static const int16_t, + mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 8, 8, 16, 16, + 0, 0, 1, 8, 9, 16, 17, 24, + 1, 1, 2, 9, 10, 17, 18, 25, + 2, 2, 3, 10, 11, 18, 19, 26, + 3, 3, 4, 11, 12, 19, 20, 27, + 4, 4, 5, 12, 13, 20, 21, 28, + 5, 5, 6, 13, 14, 21, 22, 29, 
+ 6, 6, 7, 14, 15, 22, 23, 30, + 0, 0 +}; + +DECLARE_ALIGNED(16, static const int16_t, + mrow_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 1, 1, 2, 2, + 3, 3, 4, 4, 5, 5, 6, 6, + 0, 0, 1, 8, 2, 9, 3, 10, + 4, 11, 5, 12, 6, 13, 7, 14, + 8, 8, 9, 16, 10, 17, 11, 18, + 12, 19, 13, 20, 14, 21, 15, 22, + 16, 16, 17, 24, 18, 25, 19, 26, + 20, 27, 21, 28, 22, 29, 23, 30, + 0, 0 +}; +#endif // CONFIG_EXT_TX + DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { 0, 0, 0, 0, 8, 8, 8, 0, 16, 16, 1, 8, @@ -2258,6 +2382,50 @@ DECLARE_ALIGNED(16, static const int16_t, vp10_row_iscan_4x4[16]) = { 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15, }; +#if CONFIG_EXT_TX +DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_4x8[32]) = { + 0, 1, 4, 9, 2, 3, 6, 11, + 5, 7, 8, 13, 10, 12, 14, 17, + 15, 16, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_4x8[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, + 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, + 6, 14, 22, 30, 7, 15, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_4x8[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_8x4[32]) = { + 0, 1, 4, 9, 15, 19, 24, 28, + 2, 3, 6, 11, 16, 21, 25, 29, + 5, 7, 8, 13, 18, 22, 26, 30, + 10, 12, 14, 17, 20, 23, 27, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_8x4[32]) = { + 0, 4, 8, 12, 16, 20, 24, 28, + 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, + 3, 7, 11, 15, 19, 23, 27, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_8x4[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, +}; +#endif // CONFIG_EXT_TX + #if CONFIG_EXT_TX DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_8x8[64]) = { 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, 2, 10, @@ -2943,13 +3111,6 @@ DECLARE_ALIGNED(16, static const int16_t, vp10_qtr_iscan_32x32[1024]) = { }; #endif // CONFIG_EXT_TX -const scan_order vp10_default_scan_orders[TX_SIZES] = { - {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors}, - {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors}, - {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors}, - {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors}, -}; - #if CONFIG_EXT_TX const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES] = { { // TX_4X4 @@ -3039,7 +3200,7 @@ const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES] = { } }; -const scan_order vp10_inter_scan_orders[TX_SIZES][TX_TYPES] = { +const scan_order vp10_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = { { // TX_4X4 {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors}, {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors}, @@ -3126,6 +3287,40 @@ const scan_order vp10_inter_scan_orders[TX_SIZES][TX_TYPES] = { {mcol_scan_32x32, vp10_mcol_iscan_32x32, mcol_scan_32x32_neighbors}, {mrow_scan_32x32, vp10_mrow_iscan_32x32, mrow_scan_32x32_neighbors}, {mcol_scan_32x32, vp10_mcol_iscan_32x32, mcol_scan_32x32_neighbors}, + }, { // TX_4X8 + {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors}, + {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors}, 
+ {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors}, + {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors}, + {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors}, + {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors}, + {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors}, + {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors}, + {default_scan_4x8, vp10_default_iscan_4x8, default_scan_4x8_neighbors}, + {mrow_scan_4x8, vp10_mrow_iscan_4x8, mrow_scan_4x8_neighbors}, + {mrow_scan_4x8, vp10_mrow_iscan_4x8, mrow_scan_4x8_neighbors}, + {mcol_scan_4x8, vp10_mcol_iscan_4x8, mcol_scan_4x8_neighbors}, + {mrow_scan_4x8, vp10_mrow_iscan_4x8, mrow_scan_4x8_neighbors}, + {mcol_scan_4x8, vp10_mcol_iscan_4x8, mcol_scan_4x8_neighbors}, + {mrow_scan_4x8, vp10_mrow_iscan_4x8, mrow_scan_4x8_neighbors}, + {mcol_scan_4x8, vp10_mcol_iscan_4x8, mcol_scan_4x8_neighbors}, + }, { // TX_8X4 + {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors}, + {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors}, + {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors}, + {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors}, + {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors}, + {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors}, + {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors}, + {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors}, + {default_scan_8x4, vp10_default_iscan_8x4, default_scan_8x4_neighbors}, + {mrow_scan_8x4, vp10_mrow_iscan_8x4, mrow_scan_8x4_neighbors}, + {mrow_scan_8x4, vp10_mrow_iscan_8x4, mrow_scan_8x4_neighbors}, + {mcol_scan_8x4, vp10_mcol_iscan_8x4, mcol_scan_8x4_neighbors}, + {mrow_scan_8x4, vp10_mrow_iscan_8x4, mrow_scan_8x4_neighbors}, + {mcol_scan_8x4, vp10_mcol_iscan_8x4, mcol_scan_8x4_neighbors}, + {mrow_scan_8x4, vp10_mrow_iscan_8x4, mrow_scan_8x4_neighbors}, + {mcol_scan_8x4, vp10_mcol_iscan_8x4, mcol_scan_8x4_neighbors}, } }; diff --git a/vp10/common/scan.h b/vp10/common/scan.h index aadae4024..92a8e6b06 100644 --- a/vp10/common/scan.h +++ b/vp10/common/scan.h @@ -29,7 +29,6 @@ typedef struct { const int16_t *neighbors; } scan_order; -extern const scan_order vp10_default_scan_orders[TX_SIZES]; extern const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES]; static INLINE int get_coef_context(const int16_t *neighbors, @@ -44,7 +43,7 @@ static INLINE const scan_order *get_intra_scan(TX_SIZE tx_size, } #if CONFIG_EXT_TX -extern const scan_order vp10_inter_scan_orders[TX_SIZES][TX_TYPES]; +extern const scan_order vp10_inter_scan_orders[TX_SIZES_ALL][TX_TYPES]; static INLINE const scan_order *get_inter_scan(TX_SIZE tx_size, TX_TYPE tx_type) { diff --git a/vp10/common/vp10_inv_txfm2d.c b/vp10/common/vp10_inv_txfm2d.c index 85a33baa4..071419e86 100644 --- a/vp10/common/vp10_inv_txfm2d.c +++ b/vp10/common/vp10_inv_txfm2d.c @@ -82,7 +82,7 @@ TXFM_2D_FLIP_CFG vp10_get_inv_txfm_cfg(int tx_type, int tx_size) { } TXFM_2D_FLIP_CFG vp10_get_inv_txfm_64x64_cfg(int tx_type) { - TXFM_2D_FLIP_CFG cfg; + TXFM_2D_FLIP_CFG cfg = {0, 0, NULL}; switch (tx_type) { case DCT_DCT: cfg.cfg = &inv_txfm_2d_cfg_dct_dct_64; diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl index 8f87b0222..ab2fa1608 100644 --- a/vp10/common/vp10_rtcd_defs.pl +++ b/vp10/common/vp10_rtcd_defs.pl @@ -83,6 +83,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq 
"yes") { add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp10_iht4x4_16_add/; + add_proto qw/void vp10_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht8x4_32_add/; + + add_proto qw/void vp10_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht4x8_32_add/; + add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp10_iht8x8_64_add/; @@ -143,6 +149,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp10_iht4x4_16_add sse2/; + add_proto qw/void vp10_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht8x4_32_add/; + + add_proto qw/void vp10_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht4x8_32_add/; + add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp10_iht8x8_64_add sse2/; @@ -206,6 +218,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp10_iht4x4_16_add/; + add_proto qw/void vp10_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht8x4_32_add/; + + add_proto qw/void vp10_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht4x8_32_add/; + add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp10_iht8x8_64_add/; @@ -242,6 +260,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp10_iht4x4_16_add sse2 neon dspr2/; + add_proto qw/void vp10_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht8x4_32_add/; + + add_proto qw/void vp10_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht4x8_32_add/; + add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp10_iht8x8_64_add sse2 neon dspr2/; @@ -348,6 +372,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp10_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; specialize qw/vp10_highbd_iht4x4_16_add/; + add_proto qw/void vp10_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; + specialize qw/vp10_highbd_iht8x4_32_add/; + + add_proto qw/void vp10_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; + specialize qw/vp10_highbd_iht4x8_32_add/; + add_proto qw/void vp10_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; specialize qw/vp10_highbd_iht8x8_64_add/; @@ -407,6 +437,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp10_fht4x4/, "const int16_t *input, 
tran_low_t *output, int stride, int tx_type"; specialize qw/vp10_fht4x4 sse2/; + add_proto qw/void vp10_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp10_fht8x4/; + + add_proto qw/void vp10_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp10_fht4x8/; + add_proto qw/void vp10_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp10_fht8x8 sse2/; @@ -422,6 +458,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp10_fht4x4 sse2/; + add_proto qw/void vp10_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp10_fht8x4/; + + add_proto qw/void vp10_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp10_fht4x8/; + add_proto qw/void vp10_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp10_fht8x8 sse2/; @@ -699,6 +741,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp10_highbd_fht4x4 sse4_1/; + add_proto qw/void vp10_highbd_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp10_highbd_fht8x4/; + + add_proto qw/void vp10_highbd_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp10_highbd_fht4x8/; + add_proto qw/void vp10_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp10_highbd_fht8x8/; diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c index 66b44a36b..6eab34056 100644 --- a/vp10/decoder/decodeframe.c +++ b/vp10/decoder/decodeframe.c @@ -249,15 +249,16 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, dqcoeff[0] = 0; } else { if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) - memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); + memset(dqcoeff, 0, 4 * 4 * num_4x4_blocks_wide_txsize_lookup[tx_size] * + sizeof(dqcoeff[0])); #if CONFIG_EXT_TX else - memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); + memset(dqcoeff, 0, get_tx2d_size(tx_size) * sizeof(dqcoeff[0])); #else else if (tx_size == TX_32X32 && eob <= 34) memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); else - memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); + memset(dqcoeff, 0, get_tx2d_size(tx_size) * sizeof(dqcoeff[0])); #endif } } @@ -285,8 +286,8 @@ static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd, mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; vp10_predict_intra_block(xd, pd->n4_wl, pd->n4_hl, tx_size, mode, - dst, pd->dst.stride, dst, pd->dst.stride, - col, row, plane); + dst, pd->dst.stride, dst, pd->dst.stride, + col, row, plane); if (!mbmi->skip) { TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size); @@ -323,14 +324,18 @@ static void decode_reconstruct_tx(MACROBLOCKD *const xd, vp10_reader *r, if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - if (tx_size == plane_tx_size) { + if (tx_size == plane_tx_size +#if CONFIG_EXT_TX && CONFIG_RECT_TX + || plane_tx_size >= TX_SIZES +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + ) { PLANE_TYPE plane_type = (plane == 0) ? 
PLANE_TYPE_Y : PLANE_TYPE_UV; - TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); - const scan_order *sc = get_scan(tx_size, tx_type, 1); + TX_TYPE tx_type = get_tx_type(plane_type, xd, block, plane_tx_size); + const scan_order *sc = get_scan(plane_tx_size, tx_type, 1); const int eob = vp10_decode_block_tokens(xd, plane, sc, - blk_col, blk_row, tx_size, + blk_col, blk_row, plane_tx_size, tx_type, r, mbmi->segment_id); - inverse_transform_block(xd, plane, tx_type, tx_size, + inverse_transform_block(xd, plane, tx_type, plane_tx_size, &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col], pd->dst.stride, eob); *eob_total += eob; @@ -344,7 +349,7 @@ static void decode_reconstruct_tx(MACROBLOCKD *const xd, vp10_reader *r, for (i = 0; i < 4; ++i) { const int offsetr = blk_row + ((i >> 1) << bsl); const int offsetc = blk_col + ((i & 0x01) << bsl); - int step = 1 << (2 * (tx_size - 1)); + int step = num_4x4_blocks_txsize_lookup[tx_size - 1]; if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; @@ -431,7 +436,6 @@ static MB_MODE_INFO *set_offsets(VP10_COMMON *const cm, MACROBLOCKD *const xd, set_skip_context(xd, mi_row, mi_col); - #if CONFIG_VAR_TX xd->max_tx_size = max_txsize_lookup[bsize]; #endif @@ -1321,7 +1325,8 @@ static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd, : mbmi->tx_size; const int num_4x4_w = pd->n4_w; const int num_4x4_h = pd->n4_h; - const int step = (1 << tx_size); + const int stepr = num_4x4_blocks_high_txsize_lookup[tx_size]; + const int stepc = num_4x4_blocks_wide_txsize_lookup[tx_size]; int row, col; const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? @@ -1330,8 +1335,8 @@ static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd, (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - for (row = 0; row < max_blocks_high; row += step) - for (col = 0; col < max_blocks_wide; col += step) + for (row = 0; row < max_blocks_high; row += stepr) + for (col = 0; col < max_blocks_wide; col += stepc) predict_and_reconstruct_intra_block(xd, r, mbmi, plane, @@ -1409,15 +1414,20 @@ static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd, int row, col; #if CONFIG_VAR_TX // TODO(jingning): This can be simplified for decoder performance. - const BLOCK_SIZE plane_bsize = - get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), pd); + const BLOCK_SIZE plane_bsize = get_plane_block_size( + VPXMAX(bsize, BLOCK_8X8), pd); +#if CONFIG_EXT_TX && CONFIG_RECT_TX + const TX_SIZE max_tx_size = plane ? + max_txsize_lookup[plane_bsize] : max_txsize_rect_lookup[plane_bsize]; +#else const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize]; - const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; - int bw = num_4x4_blocks_wide_lookup[txb_size]; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + int bw = num_4x4_blocks_wide_txsize_lookup[max_tx_size]; + int bh = num_4x4_blocks_high_txsize_lookup[max_tx_size]; + const int step = num_4x4_blocks_txsize_lookup[max_tx_size]; int block = 0; - const int step = 1 << (max_tx_size << 1); - for (row = 0; row < num_4x4_h; row += bw) { + for (row = 0; row < num_4x4_h; row += bh) { for (col = 0; col < num_4x4_w; col += bw) { decode_reconstruct_tx(xd, r, mbmi, plane, plane_bsize, block, row, col, max_tx_size, &eobtotal); @@ -1428,7 +1438,8 @@ static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd, const TX_SIZE tx_size = plane ? 
dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl) : mbmi->tx_size; - const int step = (1 << tx_size); + const int stepr = num_4x4_blocks_high_txsize_lookup[tx_size]; + const int stepc = num_4x4_blocks_wide_txsize_lookup[tx_size]; const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); @@ -1436,8 +1447,8 @@ static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd, (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - for (row = 0; row < max_blocks_high; row += step) - for (col = 0; col < max_blocks_wide; col += step) + for (row = 0; row < max_blocks_high; row += stepr) + for (col = 0; col < max_blocks_wide; col += stepc) eobtotal += reconstruct_inter_block(xd, r, mbmi->segment_id, @@ -1831,7 +1842,8 @@ static void decode_partition(VP10Decoder *const pbi, MACROBLOCKD *const xd, const TX_SIZE tx_size = i ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl) : mbmi->tx_size; - const int step = (1 << tx_size); + const int stepr = num_4x4_blocks_high_txsize_lookup[tx_size]; + const int stepc = num_4x4_blocks_wide_txsize_lookup[tx_size]; const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); @@ -1839,8 +1851,8 @@ static void decode_partition(VP10Decoder *const pbi, MACROBLOCKD *const xd, (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - for (row = 0; row < max_blocks_high; row += step) - for (col = 0; col < max_blocks_wide; col += step) + for (row = 0; row < max_blocks_high; row += stepr) + for (col = 0; col < max_blocks_wide; col += stepc) eobtotal += reconstruct_inter_block(xd, r, mbmi->segment_id_supertx, diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c index 8528370df..e036cebef 100644 --- a/vp10/decoder/decodemv.c +++ b/vp10/decoder/decodemv.c @@ -233,7 +233,7 @@ static int read_segment_id(vp10_reader *r, } #if CONFIG_VAR_TX -static void read_tx_size_inter(VP10_COMMON *cm, MACROBLOCKD *xd, +static void read_tx_size_vartx(VP10_COMMON *cm, MACROBLOCKD *xd, MB_MODE_INFO *mbmi, FRAME_COUNTS *counts, TX_SIZE tx_size, int blk_row, int blk_col, vp10_reader *r) { @@ -279,14 +279,14 @@ static void read_tx_size_inter(VP10_COMMON *cm, MACROBLOCKD *xd, for (i = 0; i < 4; ++i) { int offsetr = blk_row + ((i >> 1) << bsl); int offsetc = blk_col + ((i & 0x01) << bsl); - read_tx_size_inter(cm, xd, mbmi, counts, + read_tx_size_vartx(cm, xd, mbmi, counts, tx_size - 1, offsetr, offsetc, r); } } else { int idx, idy; inter_tx_size[0][0] = tx_size; - for (idy = 0; idy < (1 << tx_size) / 2; ++idy) - for (idx = 0; idx < (1 << tx_size) / 2; ++idx) + for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy) + for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx) inter_tx_size[idy][idx] = tx_size; mbmi->tx_size = tx_size; if (counts) @@ -309,17 +309,44 @@ static TX_SIZE read_selected_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd, return (TX_SIZE)tx_size; } -static TX_SIZE read_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd, - int allow_select, vp10_reader *r) { +static TX_SIZE read_tx_size_intra(VP10_COMMON *cm, MACROBLOCKD *xd, + vp10_reader *r) { TX_MODE tx_mode = cm->tx_mode; BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; - const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return TX_4X4; - if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) - return read_selected_tx_size(cm, xd, max_tx_size, r); - else - 
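/* The rect-tx sub8x8 path below returns max_txsize_rect_lookup[bsize],
   presumably the largest transform matching the block shape (assumed
   values; the table itself is not shown in this hunk):
     max_txsize_rect_lookup[BLOCK_4X8] == TX_4X8
     max_txsize_rect_lookup[BLOCK_8X4] == TX_8X4
   so no tx_size symbol is read for sub8x8 inter blocks; the transform size
   is implied by the partition shape. */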
return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]); + if (bsize >= BLOCK_8X8) { + const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; + if (tx_mode == TX_MODE_SELECT) { + return read_selected_tx_size(cm, xd, max_tx_size, r); + } else { + return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]); + } + } else { + return TX_4X4; + } +} + +static TX_SIZE read_tx_size_inter(VP10_COMMON *cm, MACROBLOCKD *xd, + int allow_select, vp10_reader *r) { + TX_MODE tx_mode = cm->tx_mode; + BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + if (xd->lossless[xd->mi[0]->mbmi.segment_id]) + return TX_4X4; + if (bsize >= BLOCK_8X8) { + const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; + if (allow_select && tx_mode == TX_MODE_SELECT) { + return read_selected_tx_size(cm, xd, max_tx_size, r); + } else { + return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]); + } + } else { +#if CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX + return max_txsize_rect_lookup[bsize]; +#else + return TX_4X4; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX + } } static int dec_get_segment_id(const VP10_COMMON *cm, const uint8_t *segment_ids, @@ -577,7 +604,7 @@ static void read_intra_frame_mode_info(VP10_COMMON *const cm, mbmi->segment_id = read_intra_segment_id(cm, xd, mi_offset, x_mis, y_mis, r); mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); - mbmi->tx_size = read_tx_size(cm, xd, 1, r); + mbmi->tx_size = read_tx_size_intra(cm, xd, r); mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; @@ -1670,14 +1697,18 @@ static void read_inter_frame_mode_info(VP10Decoder *const pbi, int idx, idy; for (idy = 0; idy < height; idy += bs) for (idx = 0; idx < width; idx += bs) - read_tx_size_inter(cm, xd, mbmi, xd->counts, max_tx_size, + read_tx_size_vartx(cm, xd, mbmi, xd->counts, max_tx_size, idy, idx, r); if (xd->counts) { const int ctx = get_tx_size_context(xd); ++xd->counts->tx_size[max_tx_size - TX_8X8][ctx][mbmi->tx_size]; } } else { - mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r); + if (inter_block) + mbmi->tx_size = read_tx_size_inter(cm, xd, !mbmi->skip, r); + else + mbmi->tx_size = read_tx_size_intra(cm, xd, r); + if (inter_block) { const int width = num_4x4_blocks_wide_lookup[bsize]; const int height = num_4x4_blocks_high_lookup[bsize]; @@ -1691,7 +1722,10 @@ static void read_inter_frame_mode_info(VP10Decoder *const pbi, set_txfm_ctx(xd->above_txfm_context, mbmi->tx_size, xd->n8_w); } #else - mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r); + if (inter_block) + mbmi->tx_size = read_tx_size_inter(cm, xd, !mbmi->skip, r); + else + mbmi->tx_size = read_tx_size_intra(cm, xd, r); #endif // CONFIG_VAR_TX #if CONFIG_SUPERTX } diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c index cc3b18bb4..7cbf01e56 100644 --- a/vp10/decoder/detokenize.c +++ b/vp10/decoder/detokenize.c @@ -55,12 +55,13 @@ static int decode_coefs(const MACROBLOCKD *xd, int ctx, const int16_t *scan, const int16_t *nb, vp10_reader *r) { FRAME_COUNTS *counts = xd->counts; - const int max_eob = 16 << (tx_size << 1); + const int max_eob = get_tx2d_size(tx_size); const FRAME_CONTEXT *const fc = xd->fc; const int ref = is_inter_block(&xd->mi[0]->mbmi); int band, c = 0; + const int tx_size_ctx = txsize_sqr_map[tx_size]; const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = - fc->coef_probs[tx_size][type][ref]; + fc->coef_probs[tx_size_ctx][type][ref]; const vpx_prob *prob; unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1]; 
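/* tx_size_ctx folds each rectangular size onto a square one so the existing
   coefficient probability and count tables are reused unchanged; presumably
   txsize_sqr_map[TX_4X8] == txsize_sqr_map[TX_8X4] == TX_4X4 (assumed
   values). The scan order and the max_eob bound still use the true
   rectangular size. */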
unsigned int (*eob_branch_count)[COEFF_CONTEXTS]; @@ -80,8 +81,8 @@ static int decode_coefs(const MACROBLOCKD *xd, const uint8_t *cat6_prob; if (counts) { - coef_counts = counts->coef[tx_size][type][ref]; - eob_branch_count = counts->eob_branch[tx_size][type][ref]; + coef_counts = counts->coef[tx_size_ctx][type][ref]; + eob_branch_count = counts->eob_branch[tx_size_ctx][type][ref]; } #if CONFIG_VP9_HIGHBITDEPTH @@ -249,15 +250,16 @@ static int decode_coefs_ans(const MACROBLOCKD *const xd, int ctx, const int16_t *scan, const int16_t *nb, struct AnsDecoder *const ans) { FRAME_COUNTS *counts = xd->counts; - const int max_eob = 16 << (tx_size << 1); + const int max_eob = get_tx2d_size(tx_size); const FRAME_CONTEXT *const fc = xd->fc; const int ref = is_inter_block(&xd->mi[0]->mbmi); int band, c = 0; int skip_eob = 0; + const int tx_size_ctx = txsize_sqr_map[tx_size]; const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = - fc->coef_probs[tx_size][type][ref]; + fc->coef_probs[tx_size_ctx][type][ref]; const rans_dec_lut(*coef_cdfs)[COEFF_CONTEXTS] = - fc->coef_cdfs[tx_size][type][ref]; + fc->coef_cdfs[tx_size_ctx][type][ref]; const vpx_prob *prob; const rans_dec_lut *cdf; unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1]; @@ -280,8 +282,8 @@ static int decode_coefs_ans(const MACROBLOCKD *const xd, dq_shift = get_tx_scale(xd, tx_type, tx_size); if (counts) { - coef_counts = counts->coef[tx_size][type][ref]; - eob_branch_count = counts->eob_branch[tx_size][type][ref]; + coef_counts = counts->coef[tx_size_ctx][type][ref]; + eob_branch_count = counts->eob_branch[tx_size_ctx][type][ref]; } #if CONFIG_VP9_HIGHBITDEPTH @@ -425,23 +427,24 @@ void dec_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, int aoff, int loff) { ENTROPY_CONTEXT *const a = pd->above_context + aoff; ENTROPY_CONTEXT *const l = pd->left_context + loff; - const int tx_size_in_blocks = 1 << tx_size; + const int tx_w_in_blocks = num_4x4_blocks_wide_txsize_lookup[tx_size]; + const int tx_h_in_blocks = num_4x4_blocks_high_txsize_lookup[tx_size]; // above if (has_eob && xd->mb_to_right_edge < 0) { int i; const int blocks_wide = pd->n4_w + (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); - int above_contexts = tx_size_in_blocks; + int above_contexts = tx_w_in_blocks; if (above_contexts + aoff > blocks_wide) above_contexts = blocks_wide - aoff; for (i = 0; i < above_contexts; ++i) a[i] = has_eob; - for (i = above_contexts; i < tx_size_in_blocks; ++i) + for (i = above_contexts; i < tx_w_in_blocks; ++i) a[i] = 0; } else { - memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_w_in_blocks); } // left @@ -449,16 +452,16 @@ void dec_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, int i; const int blocks_high = pd->n4_h + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - int left_contexts = tx_size_in_blocks; + int left_contexts = tx_h_in_blocks; if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff; for (i = 0; i < left_contexts; ++i) l[i] = has_eob; - for (i = left_contexts; i < tx_size_in_blocks; ++i) + for (i = left_contexts; i < tx_h_in_blocks; ++i) l[i] = 0; } else { - memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_h_in_blocks); } } @@ -528,7 +531,10 @@ int vp10_decode_block_tokens(MACROBLOCKD *const xd, ctx, sc->scan, sc->neighbors, r); #endif // !CONFIG_ANS dec_set_contexts(xd, pd, tx_size, eob > 0, x, y); + /* + 
vp10_set_contexts(xd, pd, + get_plane_block_size(xd->mi[0]->mbmi.sb_type, pd), + tx_size, eob > 0, x, y); + */ return eob; } - - diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c index 6430a7104..d63c5d313 100644 --- a/vp10/encoder/bitstream.c +++ b/vp10/encoder/bitstream.c @@ -375,7 +375,8 @@ static void write_selected_tx_size(const VP10_COMMON *cm, TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size; BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; - if (max_tx_size > TX_4X4) { + // For sub8x8 blocks the tx_size symbol does not need to be sent + if (bsize >= BLOCK_8X8) { vp10_write_token(w, vp10_tx_size_tree[max_tx_size - TX_8X8], cm->fc->tx_size_probs[max_tx_size - TX_8X8] [get_tx_size_context(xd)], @@ -801,7 +802,7 @@ static void pack_txb_tokens(vp10_writer *w, for (i = 0; i < 4; ++i) { const int offsetr = blk_row + ((i >> 1) << bsl); const int offsetc = blk_col + ((i & 0x01) << bsl); - int step = 1 << (2 * (tx_size - 1)); + int step = num_4x4_blocks_txsize_lookup[tx_size - 1]; if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; @@ -1662,7 +1663,7 @@ static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile, const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; int bw = num_4x4_blocks_wide_lookup[txb_size]; int block = 0; - const int step = 1 << (max_tx_size << 1); + const int step = num_4x4_blocks_txsize_lookup[max_tx_size]; for (row = 0; row < num_4x4_h; row += bw) { for (col = 0; col < num_4x4_w; col += bw) { pack_txb_tokens(w, tok, tok_end, xd, mbmi, plane, plane_bsize, diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c index 11d4a8e99..46bcd0bab 100644 --- a/vp10/encoder/dct.c +++ b/vp10/encoder/dct.c @@ -1038,29 +1038,29 @@ static void fhalfright32(const tran_low_t *input, tran_low_t *output) { // Note overall scaling factor is 4 times orthogonal } -static void copy_block(const int16_t *src, int src_stride, int l, +static void copy_block(const int16_t *src, int src_stride, int l, int w, int16_t *dest, int dest_stride) { int i; for (i = 0; i < l; ++i) { memcpy(dest + dest_stride * i, src + src_stride * i, - l * sizeof(int16_t)); + w * sizeof(int16_t)); } } -static void fliplr(int16_t *dest, int stride, int l) { +static void fliplr(int16_t *dest, int stride, int l, int w) { int i, j; for (i = 0; i < l; ++i) { - for (j = 0; j < l / 2; ++j) { + for (j = 0; j < w / 2; ++j) { const int16_t tmp = dest[i * stride + j]; - dest[i * stride + j] = dest[i * stride + l - 1 - j]; - dest[i * stride + l - 1 - j] = tmp; + dest[i * stride + j] = dest[i * stride + w - 1 - j]; + dest[i * stride + w - 1 - j] = tmp; } } } -static void flipud(int16_t *dest, int stride, int l) { +static void flipud(int16_t *dest, int stride, int l, int w) { int i, j; - for (j = 0; j < l; ++j) { + for (j = 0; j < w; ++j) { for (i = 0; i < l / 2; ++i) { const int16_t tmp = dest[i * stride + j]; dest[i * stride + j] = dest[(l - 1 - i) * stride + j]; @@ -1069,36 +1069,40 @@ static void flipud(int16_t *dest, int stride, int l) { } } -static void fliplrud(int16_t *dest, int stride, int l) { +static void fliplrud(int16_t *dest, int stride, int l, int w) { int i, j; for (i = 0; i < l / 2; ++i) { - for (j = 0; j < l; ++j) { + for (j = 0; j < w; ++j) { const int16_t tmp = dest[i * stride + j]; - dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j]; - dest[(l - 1 - i) * stride + l - 1 - j] = tmp; + dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j]; + dest[(l - 1 - i) * stride + w - 1 - j] = tmp; } } } -static void 
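/* The flip helpers now take separate row (l) and column (w) counts so they
   work on rectangular blocks. A usage sketch matching the 4x8 transform in
   this file: for a 4-wide, 8-high residual with a vertical FLIPADST, the
   input is mirrored top-to-bottom first:

     int16_t buf[8 * 4];
     copy_flipud(src, src_stride, 8, 4, buf, 4);   // 8 rows, 4 columns

   which is what maybe_flip_input(&input, &stride, n2, n, ...) ends up doing
   inside vp10_fht4x8_c for the FLIPADST_* types. */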
copy_fliplr(const int16_t *src, int src_stride, int l, - int16_t *dest, int dest_stride) { - copy_block(src, src_stride, l, dest, dest_stride); - fliplr(dest, dest_stride, l); +static void copy_fliplr(const int16_t *src, int src_stride, + int l, int w, + int16_t *dest, int dest_stride) { + copy_block(src, src_stride, l, w, dest, dest_stride); + fliplr(dest, dest_stride, l, w); } -static void copy_flipud(const int16_t *src, int src_stride, int l, - int16_t *dest, int dest_stride) { - copy_block(src, src_stride, l, dest, dest_stride); - flipud(dest, dest_stride, l); +static void copy_flipud(const int16_t *src, int src_stride, + int l, int w, + int16_t *dest, int dest_stride) { + copy_block(src, src_stride, l, w, dest, dest_stride); + flipud(dest, dest_stride, l, w); } -static void copy_fliplrud(const int16_t *src, int src_stride, int l, - int16_t *dest, int dest_stride) { - copy_block(src, src_stride, l, dest, dest_stride); - fliplrud(dest, dest_stride, l); +static void copy_fliplrud(const int16_t *src, int src_stride, + int l, int w, + int16_t *dest, int dest_stride) { + copy_block(src, src_stride, l, w, dest, dest_stride); + fliplrud(dest, dest_stride, l, w); } -static void maybe_flip_input(const int16_t **src, int *src_stride, int l, +static void maybe_flip_input(const int16_t **src, int *src_stride, + int l, int w, int16_t *buff, int tx_type) { switch (tx_type) { case DCT_DCT: @@ -1114,21 +1118,21 @@ static void maybe_flip_input(const int16_t **src, int *src_stride, int l, case FLIPADST_DCT: case FLIPADST_ADST: case V_FLIPADST: - copy_flipud(*src, *src_stride, l, buff, l); + copy_flipud(*src, *src_stride, l, w, buff, w); *src = buff; - *src_stride = l; + *src_stride = w; break; case DCT_FLIPADST: case ADST_FLIPADST: case H_FLIPADST: - copy_fliplr(*src, *src_stride, l, buff, l); + copy_fliplr(*src, *src_stride, l, w, buff, w); *src = buff; - *src_stride = l; + *src_stride = w; break; case FLIPADST_FLIPADST: - copy_fliplrud(*src, *src_stride, l, buff, l); + copy_fliplrud(*src, *src_stride, l, w, buff, w); *src = buff; - *src_stride = l; + *src_stride = w; break; default: assert(0); @@ -1219,6 +1223,44 @@ static const transform_2d FHT_32[] = { { fhalfright32, fidtx32 }, // V_FLIPADST { fidtx32, fhalfright32 }, // H_FLIPADST }; + +static const transform_2d FHT_4x8[] = { + { fdct8, fdct4 }, // DCT_DCT + { fadst8, fdct4 }, // ADST_DCT + { fdct8, fadst4 }, // DCT_ADST + { fadst8, fadst4 }, // ADST_ADST + { fadst8, fdct4 }, // FLIPADST_DCT + { fdct8, fadst4 }, // DCT_FLIPADST + { fadst8, fadst4 }, // FLIPADST_FLIPADST + { fadst8, fadst4 }, // ADST_FLIPADST + { fadst8, fadst4 }, // FLIPADST_ADST + { fidtx8, fidtx4 }, // IDTX + { fdct8, fidtx4 }, // V_DCT + { fidtx8, fdct4 }, // H_DCT + { fadst8, fidtx4 }, // V_ADST + { fidtx8, fadst4 }, // H_ADST + { fadst8, fidtx4 }, // V_FLIPADST + { fidtx8, fadst4 }, // H_FLIPADST +}; + +static const transform_2d FHT_8x4[] = { + { fdct4, fdct8 }, // DCT_DCT + { fadst4, fdct8 }, // ADST_DCT + { fdct4, fadst8 }, // DCT_ADST + { fadst4, fadst8 }, // ADST_ADST + { fadst4, fdct8 }, // FLIPADST_DCT + { fdct4, fadst8 }, // DCT_FLIPADST + { fadst4, fadst8 }, // FLIPADST_FLIPADST + { fadst4, fadst8 }, // ADST_FLIPADST + { fadst4, fadst8 }, // FLIPADST_ADST + { fidtx4, fidtx8 }, // IDTX + { fdct4, fidtx8 }, // V_DCT + { fidtx4, fdct8 }, // H_DCT + { fadst4, fidtx8 }, // V_ADST + { fidtx4, fadst8 }, // H_ADST + { fadst4, fidtx8 }, // V_FLIPADST + { fidtx4, fadst8 }, // H_FLIPADST +}; #endif // CONFIG_EXT_TX void vp10_fht4x4_c(const int16_t *input, tran_low_t *output, @@ 
-1233,7 +1275,7 @@ void vp10_fht4x4_c(const int16_t *input, tran_low_t *output, #if CONFIG_EXT_TX int16_t flipped_input[4 * 4]; - maybe_flip_input(&input, &stride, 4, flipped_input, tx_type); + maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type); #endif // Columns @@ -1258,6 +1300,70 @@ void vp10_fht4x4_c(const int16_t *input, tran_low_t *output, } } +#if CONFIG_EXT_TX +void vp10_fht4x8_c(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + const int n = 4; + const int n2 = 8; + tran_low_t out[8 * 4]; + tran_low_t temp_in[8], temp_out[8]; + int i, j; + const transform_2d ht = FHT_4x8[tx_type]; + int16_t flipped_input[8 * 4]; + maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); + + // Columns + for (i = 0; i < n; ++i) { + for (j = 0; j < n2; ++j) + temp_in[j] = input[j * stride + i] * 8; + ht.cols(temp_in, temp_out); + for (j = 0; j < n2; ++j) + out[j * n + i] = (tran_low_t)fdct_round_shift(temp_out[j] * Sqrt2); + } + + // Rows + for (i = 0; i < n2; ++i) { + for (j = 0; j < n; ++j) + temp_in[j] = out[j + i * n]; + ht.rows(temp_in, temp_out); + for (j = 0; j < n; ++j) + output[j + i * n] = (temp_out[j] + 1) >> 2; + } + // Note: overall scale factor of transform is 8 times unitary +} + +void vp10_fht8x4_c(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + const int n = 4; + const int n2 = 8; + tran_low_t out[8 * 4]; + tran_low_t temp_in[8], temp_out[8]; + int i, j; + const transform_2d ht = FHT_8x4[tx_type]; + int16_t flipped_input[8 * 4]; + maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); + + // Columns + for (i = 0; i < n2; ++i) { + for (j = 0; j < n; ++j) + temp_in[j] = input[j * stride + i] * 8; + ht.cols(temp_in, temp_out); + for (j = 0; j < n; ++j) + out[j * n2 + i] = (tran_low_t)fdct_round_shift(temp_out[j] * Sqrt2); + } + + // Rows + for (i = 0; i < n; ++i) { + for (j = 0; j < n2; ++j) + temp_in[j] = out[j + i * n2]; + ht.rows(temp_in, temp_out); + for (j = 0; j < n2; ++j) + output[j + i * n2] = (temp_out[j] + 1) >> 2; + } + // Note: overall scale factor of transform is 8 times unitary +} +#endif // CONFIG_EXT_TX + void vp10_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, @@ -1382,7 +1488,7 @@ void vp10_fht8x8_c(const int16_t *input, tran_low_t *output, #if CONFIG_EXT_TX int16_t flipped_input[8 * 8]; - maybe_flip_input(&input, &stride, 8, flipped_input, tx_type); + maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type); #endif // Columns @@ -1473,7 +1579,7 @@ void vp10_fht16x16_c(const int16_t *input, tran_low_t *output, #if CONFIG_EXT_TX int16_t flipped_input[16 * 16]; - maybe_flip_input(&input, &stride, 16, flipped_input, tx_type); + maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type); #endif // Columns @@ -1498,17 +1604,29 @@ void vp10_fht16x16_c(const int16_t *input, tran_low_t *output, #if CONFIG_VP9_HIGHBITDEPTH void vp10_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, - int stride, int tx_type) { + int stride, int tx_type) { vp10_fht4x4_c(input, output, stride, tx_type); } +#if CONFIG_EXT_TX +void vp10_highbd_fht8x4_c(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + vp10_fht8x4_c(input, output, stride, tx_type); +} + +void vp10_highbd_fht4x8_c(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + vp10_fht4x8_c(input, output, stride, tx_type); +} +#endif // CONFIG_EXT_TX + void vp10_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, - int stride, int 
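/* Why vp10_fht4x8_c/vp10_fht8x4_c scale each column result by Sqrt2: if a
   length-N 1-D pass scales like sqrt(N), the 8-point and 4-point passes
   compose to sqrt(8) * sqrt(4) = 4 * sqrt(2), which is not a power of two.
   One extra sqrt(2) per coefficient, applied as
     fdct_round_shift(temp_out[j] * Sqrt2)
   restores a power-of-two overall factor, consistent with the "8 times
   unitary" notes above. The exact fixed-point value of Sqrt2 is defined in
   the transform headers and is not shown in this patch. */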
tx_type) { + int stride, int tx_type) { vp10_fht8x8_c(input, output, stride, tx_type); } void vp10_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, - int stride) { + int stride) { vp10_fwht4x4_c(input, output, stride); } @@ -1530,7 +1648,7 @@ void vp10_fht32x32_c(const int16_t *input, tran_low_t *output, const transform_2d ht = FHT_32[tx_type]; int16_t flipped_input[32 * 32]; - maybe_flip_input(&input, &stride, 32, flipped_input, tx_type); + maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type); // Columns for (i = 0; i < 32; ++i) { diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c index aceb10f1b..aa8b402cc 100644 --- a/vp10/encoder/encodemb.c +++ b/vp10/encoder/encodemb.c @@ -67,20 +67,6 @@ static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\ } -static const int16_t band_count_table[TX_SIZES][8] = { - { 1, 2, 3, 4, 3, 16 - 13, 0 }, - { 1, 2, 3, 4, 11, 64 - 21, 0 }, - { 1, 2, 3, 4, 11, 256 - 21, 0 }, - { 1, 2, 3, 4, 11, 1024 - 21, 0 }, -}; - -static const int16_t band_cum_count_table[TX_SIZES][8] = { - { 0, 1, 3, 6, 10, 13, 16, 0 }, - { 0, 1, 3, 6, 10, 21, 64, 0 }, - { 0, 1, 3, 6, 10, 21, 256, 0 }, - { 0, 1, 3, 6, 10, 21, 1024, 0 }, -}; - int vp10_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx) { MACROBLOCKD *const xd = &mb->e_mbd; @@ -95,7 +81,7 @@ int vp10_optimize_b(MACROBLOCK *mb, int plane, int block, tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int eob = p->eobs[block]; const PLANE_TYPE type = pd->plane_type; - const int default_eob = 16 << (tx_size << 1); + const int default_eob = get_tx2d_size(tx_size); const int16_t* const dequant_ptr = pd->dequant; const uint8_t* const band_translate = get_band_translate(tx_size); TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size); @@ -125,9 +111,9 @@ int vp10_optimize_b(MACROBLOCK *mb, int plane, int block, const int *cat6_high_cost = vp10_get_high_cost_table(8); #endif unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - mb->token_costs[tx_size][type][ref]; - const int16_t *band_counts = &band_count_table[tx_size][band]; - int16_t band_left = eob - band_cum_count_table[tx_size][band] + 1; + mb->token_costs[txsize_sqr_map[tx_size]][type][ref]; + const uint16_t *band_counts = &band_count_table[tx_size][band]; + uint16_t band_left = eob - band_cum_count_table[tx_size][band] + 1; int shortcut = 0; int next_shortcut = 0; @@ -444,8 +430,7 @@ void vp10_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row, uint16_t *const eob = &p->eobs[block]; const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; - const int tx1d_size = get_tx1d_size(tx_size); - const int tx2d_size = tx1d_size * tx1d_size; + const int tx2d_size = get_tx2d_size(tx_size); FWD_TXFM_PARAM fwd_txfm_param; QUANT_PARAM qparam; @@ -524,89 +509,44 @@ void vp10_xform_quant_nuq(MACROBLOCK *x, int plane, int block, int blk_row, fwd_txfm_param.bd = xd->bd; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); - switch (tx_size) { - case TX_32X32: - highbd_quantize_32x32_nuq(coeff, 1024, x->skip_block, - p->quant, p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *) - p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) - pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_16X16: - highbd_quantize_nuq(coeff, 256, x->skip_block, - p->quant, p->quant_shift, pd->dequant, - (const 
cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) - pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_8X8: - highbd_quantize_nuq(coeff, 64, x->skip_block, - p->quant, p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) - pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_4X4: - highbd_quantize_nuq(coeff, 16, x->skip_block, - p->quant, p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) + if (tx_size == TX_32X32) { + highbd_quantize_32x32_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant, p->quant_shift, pd->dequant, + (const cuml_bins_type_nuq *) + p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *) pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - default: - assert(0); + qcoeff, dqcoeff, eob, + scan_order->scan, band); + } else { + highbd_quantize_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant, p->quant_shift, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *) + pd->dequant_val_nuq[dq], + qcoeff, dqcoeff, eob, + scan_order->scan, band); } return; } #endif // CONFIG_VP9_HIGHBITDEPTH fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); - switch (tx_size) { - case TX_32X32: - quantize_32x32_nuq(coeff, 1024, x->skip_block, - p->quant, p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) - pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_16X16: - quantize_nuq(coeff, 256, x->skip_block, - p->quant, p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_8X8: - quantize_nuq(coeff, 64, x->skip_block, - p->quant, p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_4X4: - quantize_nuq(coeff, 16, x->skip_block, - p->quant, p->quant_shift, pd->dequant, - (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - default: - assert(0); - break; + if (tx_size == TX_32X32) { + quantize_32x32_nuq(coeff, 1024, x->skip_block, + p->quant, p->quant_shift, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *) + pd->dequant_val_nuq[dq], + qcoeff, dqcoeff, eob, + scan_order->scan, band); + } else { + quantize_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant, p->quant_shift, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], + qcoeff, dqcoeff, eob, + scan_order->scan, band); } } @@ -645,99 +585,48 @@ void vp10_xform_quant_fp_nuq(MACROBLOCK *x, int plane, int block, int blk_row, fwd_txfm_param.bd = xd->bd; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); - switch (tx_size) { - case TX_32X32: - highbd_quantize_32x32_fp_nuq(coeff, 1024, x->skip_block, - p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *) - p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq 
*) - pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_16X16: - highbd_quantize_fp_nuq(coeff, 256, x->skip_block, - p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *) - p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) - pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_8X8: - highbd_quantize_fp_nuq(coeff, 64, x->skip_block, - p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *) - p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) - pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_4X4: - highbd_quantize_fp_nuq(coeff, 16, x->skip_block, - p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *) + if (tx_size == TX_32X32) { + highbd_quantize_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant_fp, pd->dequant, + (const cuml_bins_type_nuq *) p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) + (const dequant_val_type_nuq *) pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - default: - assert(0); + qcoeff, dqcoeff, eob, + scan_order->scan, band); + } else { + highbd_quantize_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant_fp, pd->dequant, + (const cuml_bins_type_nuq *) + p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *) + pd->dequant_val_nuq[dq], + qcoeff, dqcoeff, eob, + scan_order->scan, band); } return; } #endif // CONFIG_VP9_HIGHBITDEPTH fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); - switch (tx_size) { - case TX_32X32: - quantize_32x32_fp_nuq(coeff, 1024, x->skip_block, - p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *) - p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) - pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_16X16: - quantize_fp_nuq(coeff, 256, x->skip_block, - p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *) + if (tx_size == TX_32X32) { + quantize_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant_fp, pd->dequant, + (const cuml_bins_type_nuq *) p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) + (const dequant_val_type_nuq *) pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_8X8: - quantize_fp_nuq(coeff, 64, x->skip_block, - p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *) - p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) - pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - case TX_4X4: - quantize_fp_nuq(coeff, 16, x->skip_block, - p->quant_fp, pd->dequant, - (const cuml_bins_type_nuq *) - p->cuml_bins_nuq[dq], - (const dequant_val_type_nuq *) - pd->dequant_val_nuq[dq], - qcoeff, dqcoeff, eob, - scan_order->scan, band); - break; - default: - assert(0); - break; + qcoeff, dqcoeff, eob, + scan_order->scan, band); + } else { + quantize_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant_fp, pd->dequant, + (const cuml_bins_type_nuq *) + p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *) + pd->dequant_val_nuq[dq], + qcoeff, dqcoeff, eob, + scan_order->scan, band); } } @@ -773,79 +662,38 @@ void vp10_xform_quant_dc_nuq(MACROBLOCK *x, int plane, int block, int blk_row, fwd_txfm_param.bd = xd->bd; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); - switch (tx_size) { - case TX_32X32: - highbd_quantize_dc_32x32_nuq(coeff, 1024, x->skip_block, - p->quant[0], p->quant_shift[0], - 
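/* The switch-to-if rewrites in this file rely on get_tx2d_size() returning
   the coefficient count directly: 16, 64, 256 and 1024 for the square
   sizes, and 32 for TX_4X8/TX_8X4 (two 4x4 units of 16 coefficients).
   Only TX_32X32 needs its own quantizer variant, so every other size,
   square or rectangular, can share the generic *_nuq() path. */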
pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_16X16: - highbd_quantize_dc_nuq(coeff, 256, x->skip_block, - p->quant[0], p->quant_shift[0], - pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_8X8: - highbd_quantize_dc_nuq(coeff, 64, x->skip_block, - p->quant[0], p->quant_shift[0], - pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_4X4: - highbd_quantize_dc_nuq(coeff, 16, x->skip_block, - p->quant[0], p->quant_shift[0], - pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - default: - assert(0); + if (tx_size == TX_32X32) { + highbd_quantize_dc_32x32_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant[0], p->quant_shift[0], + pd->dequant[0], + p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], + qcoeff, dqcoeff, eob); + } else { + highbd_quantize_dc_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant[0], p->quant_shift[0], + pd->dequant[0], + p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], + qcoeff, dqcoeff, eob); } return; } #endif // CONFIG_VP9_HIGHBITDEPTH fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); - switch (tx_size) { - case TX_32X32: - quantize_dc_32x32_nuq(coeff, 1024, x->skip_block, - p->quant[0], p->quant_shift[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_16X16: - quantize_dc_nuq(coeff, 256, x->skip_block, - p->quant[0], p->quant_shift[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_8X8: - quantize_dc_nuq(coeff, 64, x->skip_block, - p->quant[0], p->quant_shift[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_4X4: - quantize_dc_nuq(coeff, 16, x->skip_block, - p->quant[0], p->quant_shift[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - default: - assert(0); - break; + if (tx_size == TX_32X32) { + quantize_dc_32x32_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant[0], p->quant_shift[0], pd->dequant[0], + p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], + qcoeff, dqcoeff, eob); + } else { + quantize_dc_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant[0], p->quant_shift[0], pd->dequant[0], + p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], + qcoeff, dqcoeff, eob); } } @@ -882,76 +730,37 @@ void vp10_xform_quant_dc_fp_nuq(MACROBLOCK *x, int plane, int block, fwd_txfm_param.bd = xd->bd; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); - switch (tx_size) { - case TX_32X32: - highbd_quantize_dc_32x32_fp_nuq(coeff, 1024, x->skip_block, - p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_16X16: - highbd_quantize_dc_fp_nuq(coeff, 256, x->skip_block, - p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_8X8: - highbd_quantize_dc_fp_nuq(coeff, 64, x->skip_block, - p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_4X4: - highbd_quantize_dc_fp_nuq(coeff, 16, x->skip_block, - 
p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - default: - assert(0); + if (tx_size == TX_32X32) { + highbd_quantize_dc_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), + x->skip_block, + p->quant_fp[0], pd->dequant[0], + p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], + qcoeff, dqcoeff, eob); + } else { + highbd_quantize_dc_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant_fp[0], pd->dequant[0], + p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], + qcoeff, dqcoeff, eob); } return; } #endif // CONFIG_VP9_HIGHBITDEPTH fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); - switch (tx_size) { - case TX_32X32: - quantize_dc_32x32_fp_nuq(coeff, 1024, x->skip_block, - p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_16X16: - quantize_dc_fp_nuq(coeff, 256, x->skip_block, - p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - - break; - case TX_8X8: - quantize_dc_fp_nuq(coeff, 64, x->skip_block, - p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - case TX_4X4: - quantize_dc_fp_nuq(coeff, 16, x->skip_block, - p->quant_fp[0], pd->dequant[0], - p->cuml_bins_nuq[dq][0], - pd->dequant_val_nuq[dq][0], - qcoeff, dqcoeff, eob); - break; - default: - assert(0); - break; + if (tx_size == TX_32X32) { + quantize_dc_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant_fp[0], pd->dequant[0], + p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], + qcoeff, dqcoeff, eob); + } else { + quantize_dc_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, + p->quant_fp[0], pd->dequant[0], + p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], + qcoeff, dqcoeff, eob); } } #endif // CONFIG_NEW_QUANT @@ -1011,8 +820,10 @@ static void encode_block(int plane, int block, int blk_row, int blk_col, } #if CONFIG_VAR_TX - for (i = 0; i < (1 << tx_size); ++i) { + for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i) { a[i] = a[0]; + } + for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i) { l[i] = l[0]; } #endif @@ -1076,10 +887,14 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col, assert(bsl > 0); --bsl; +#if CONFIG_EXT_TX + assert(tx_size < TX_SIZES); +#endif // CONFIG_EXT_TX + for (i = 0; i < 4; ++i) { const int offsetr = blk_row + ((i >> 1) << bsl); const int offsetc = blk_col + ((i & 0x01) << bsl); - int step = 1 << (2 * (tx_size - 1)); + int step = num_4x4_blocks_txsize_lookup[tx_size - 1]; if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; @@ -1165,7 +980,7 @@ void vp10_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { const int bh = num_4x4_blocks_wide_lookup[txb_size]; int idx, idy; int block = 0; - int step = 1 << (max_tx_size * 2); + int step = num_4x4_blocks_txsize_lookup[max_tx_size]; vp10_get_entropy_contexts(bsize, TX_4X4, pd, ctx.ta[plane], ctx.tl[plane]); #else const struct macroblockd_plane* const pd = &xd->plane[plane]; @@ -1242,12 +1057,15 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col, uint16_t *eob = &p->eobs[block]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; - const int tx1d_size = get_tx1d_size(tx_size); + const int tx1d_width = num_4x4_blocks_wide_txsize_lookup[tx_size] << 2; + const int tx1d_height = 
num_4x4_blocks_high_txsize_lookup[tx_size] << 2; ENTROPY_CONTEXT *a = NULL, *l = NULL; int ctx; INV_TXFM_PARAM inv_txfm_param; + assert(tx1d_width == tx1d_height); + dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; src = &p->src.buf[4 * (blk_row * src_stride + blk_col)]; src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; @@ -1257,14 +1075,14 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col, dst_stride, blk_col, blk_row, plane); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); + vpx_highbd_subtract_block(tx1d_height, tx1d_width, src_diff, diff_stride, + src, src_stride, dst, dst_stride, xd->bd); } else { - vpx_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src, + vpx_subtract_block(tx1d_height, tx1d_width, src_diff, diff_stride, src, src_stride, dst, dst_stride); } #else - vpx_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src, + vpx_subtract_block(tx1d_height, tx1d_width, src_diff, diff_stride, src, src_stride, dst, dst_stride); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1274,8 +1092,8 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col, if (args->enable_optimize_b) { #if CONFIG_NEW_QUANT - vp10_xform_quant_fp_nuq(x, plane, block, blk_row, blk_col, plane_bsize, - tx_size, ctx); + vp10_xform_quant_fp_nuq(x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, ctx); #else // CONFIG_NEW_QUANT vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size, VP10_XFORM_QUANT_FP); diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c index a0e0fdcaa..d5cf82706 100644 --- a/vp10/encoder/hybrid_fwd_txfm.c +++ b/vp10/encoder/hybrid_fwd_txfm.c @@ -61,6 +61,22 @@ static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, } } +#if CONFIG_EXT_TX +static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void) fwd_txfm_opt; + vp10_fht8x4(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void) fwd_txfm_opt; + vp10_fht4x8(src_diff, coeff, diff_stride, tx_type); +} +#endif // CONFIG_EXT_TX + static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type, FWD_TXFM_OPT fwd_txfm_opt) { @@ -214,6 +230,24 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, } } +#if CONFIG_EXT_TX +static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void) fwd_txfm_opt; + (void) bd; + vp10_highbd_fht8x4(src_diff, coeff, diff_stride, tx_type); +} + +static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void) fwd_txfm_opt; + (void) bd; + vp10_highbd_fht4x8(src_diff, coeff, diff_stride, tx_type); +} +#endif // CONFIG_EXT_TX + static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type, FWD_TXFM_OPT fwd_txfm_opt, const int bd) { @@ -344,6 +378,14 @@ void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, case TX_8X8: fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); break; +#if 
CONFIG_EXT_TX + case TX_4X8: + fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_8X4: + fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; +#endif // CONFIG_EXT_TX case TX_4X4: fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless); break; @@ -375,6 +417,16 @@ void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, bd); break; +#if CONFIG_EXT_TX + case TX_4X8: + highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, + fwd_txfm_opt, bd); + break; + case TX_8X4: + highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, + fwd_txfm_opt, bd); + break; +#endif // CONFIG_EXT_TX case TX_4X4: highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless, bd); diff --git a/vp10/encoder/hybrid_fwd_txfm.h b/vp10/encoder/hybrid_fwd_txfm.h index cd028bc96..07b832cdf 100644 --- a/vp10/encoder/hybrid_fwd_txfm.h +++ b/vp10/encoder/hybrid_fwd_txfm.h @@ -38,22 +38,6 @@ void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param); #endif // CONFIG_VP9_HIGHBITDEPTH -static INLINE int get_tx1d_size(TX_SIZE tx_size) { - switch (tx_size) { - case TX_32X32: - return 32; - case TX_16X16: - return 16; - case TX_8X8: - return 8; - case TX_4X4: - return 4; - default: - assert(0); - return -1; - } -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c index 028d578e1..cbdcc94c8 100644 --- a/vp10/encoder/rd.c +++ b/vp10/encoder/rd.c @@ -597,6 +597,18 @@ static void get_entropy_contexts_plane( memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); break; +#if CONFIG_EXT_TX + case TX_4X8: + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_8X4: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + break; +#endif // CONFIG_EXT_TX case TX_8X8: for (i = 0; i < num_4x4_w; i += 2) t_above[i] = !!*(const uint16_t *)&above[i]; @@ -622,9 +634,9 @@ static void get_entropy_contexts_plane( } void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, - const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], - ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left); } diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c index 817721287..97b6a6ff3 100644 --- a/vp10/encoder/rdopt.c +++ b/vp10/encoder/rdopt.c @@ -21,6 +21,7 @@ #include "vpx_ports/system_state.h" #include "vp10/common/common.h" +#include "vp10/common/common_data.h" #include "vp10/common/entropy.h" #include "vp10/common/entropymode.h" #include "vp10/common/idct.h" @@ -927,12 +928,6 @@ int64_t vp10_highbd_block_error_c(const tran_low_t *coeff, * can skip this if the last coefficient in this transform block, e.g. the * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block, * were non-zero). 
*/ -static const int16_t band_counts[TX_SIZES][8] = { - { 1, 2, 3, 4, 3, 16 - 13, 0 }, - { 1, 2, 3, 4, 11, 64 - 21, 0 }, - { 1, 2, 3, 4, 11, 256 - 21, 0 }, - { 1, 2, 3, 4, 11, 1024 - 21, 0 }, -}; static int cost_coeffs(MACROBLOCK *x, int plane, int block, #if CONFIG_VAR_TX @@ -948,11 +943,12 @@ static int cost_coeffs(MACROBLOCK *x, const struct macroblock_plane *p = &x->plane[plane]; const struct macroblockd_plane *pd = &xd->plane[plane]; const PLANE_TYPE type = pd->plane_type; - const int16_t *band_count = &band_counts[tx_size][1]; + const uint16_t *band_count = &band_count_table[tx_size][1]; const int eob = p->eobs[block]; const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + const int tx_size_ctx = txsize_sqr_map[tx_size]; unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - x->token_costs[tx_size][type][is_inter_block(mbmi)]; + x->token_costs[tx_size_ctx][type][is_inter_block(mbmi)]; uint8_t token_cache[MAX_TX_SQUARE]; #if CONFIG_VAR_TX int pt = coeff_ctx; @@ -1064,7 +1060,7 @@ static void dist_block(const VP10_COMP *cpi, MACROBLOCK *x, int plane, if (cpi->sf.use_transform_domain_distortion) { // Transform domain distortion computation is more efficient as it does // not involve an inverse transform, but it is less accurate. - const int ss_txfrm_size = tx_size << 1; + const int ss_txfrm_size = num_4x4_blocks_txsize_log2_lookup[tx_size]; int64_t this_sse; int tx_type = get_tx_type(pd->plane_type, xd, block, tx_size); int shift = (MAX_TX_SCALE - get_tx_scale(xd, tx_type, tx_size)) * 2; @@ -1081,7 +1077,8 @@ static void dist_block(const VP10_COMP *cpi, MACROBLOCK *x, int plane, *out_sse = this_sse >> shift; } else { const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; - const int bs = 4*num_4x4_blocks_wide_lookup[tx_bsize]; + const int bsw = 4 * num_4x4_blocks_wide_lookup[tx_bsize]; + const int bsh = 4 * num_4x4_blocks_high_lookup[tx_bsize]; const int src_stride = x->plane[plane].src.stride; const int dst_stride = xd->plane[plane].dst.stride; const int src_idx = 4 * (blk_row * src_stride + blk_col); @@ -1121,13 +1118,13 @@ static void dist_block(const VP10_COMP *cpi, MACROBLOCK *x, int plane, recon = CONVERT_TO_BYTEPTR(recon); inv_txfm_param.bd = xd->bd; vpx_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, - NULL, 0, NULL, 0, bs, bs, xd->bd); + NULL, 0, NULL, 0, bsw, bsh, xd->bd); highbd_inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param); } else #endif // CONFIG_VP9_HIGHBITDEPTH { vpx_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, - NULL, 0, NULL, 0, bs, bs); + NULL, 0, NULL, 0, bsw, bsh); inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param); } @@ -1159,6 +1156,29 @@ static int rate_block(int plane, int block, int blk_row, int blk_col, #endif // CONFIG_VAR_TX } +static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride, + TX_SIZE tx_size) { + uint64_t sse; + switch (tx_size) { +#if CONFIG_EXT_TX + case TX_4X8: + sse = vpx_sum_squares_2d_i16(diff, diff_stride, 4) + + vpx_sum_squares_2d_i16(diff + 4 * diff_stride, diff_stride, 4); + break; + case TX_8X4: + sse = vpx_sum_squares_2d_i16(diff, diff_stride, 4) + + vpx_sum_squares_2d_i16(diff + 4, diff_stride, 4); + break; +#endif // CONFIG_EXT_TX + default: + assert(tx_size < TX_SIZES); + sse = vpx_sum_squares_2d_i16( + diff, diff_stride, num_4x4_blocks_wide_txsize_lookup[tx_size] << 2); + break; + } + return sse; +} + static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { @@ -1188,7 +1208,6 @@ static void
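/* sum_squares_2d() above handles the rectangular sizes by tiling the
   existing square kernel: a 4x8 difference block is the sum of its top and
   bottom 4x4 halves, an 8x4 block the sum of its left and right halves.
   This reuses vpx_sum_squares_2d_i16() unchanged rather than adding a
   rectangular SIMD kernel in this patch, at the cost of one extra call per
   rectangular block. */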
block_rd_txfm(int plane, int block, int blk_row, int blk_col, } else { // Note that the encode block_intra call above already calls // inv_txfm_add, so we can't just call dist_block here. - const int bs = 4 << tx_size; const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; const vpx_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf; @@ -1204,8 +1223,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; unsigned int tmp; + sse = sum_squares_2d(diff, diff_stride, tx_size); - sse = vpx_sum_squares_2d_i16(diff, diff_stride, bs); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); @@ -1316,6 +1335,10 @@ void vp10_txfm_rd_in_plane_supertx(MACROBLOCK *x, args.best_rd = ref_best_rd; args.use_fast_coef_costing = use_fast_coef_casting; +#if CONFIG_EXT_TX + assert(tx_size < TX_SIZES); +#endif // CONFIG_EXT_TX + if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size; @@ -1361,6 +1384,7 @@ static int64_t txfm_yrd(VP10_COMP *cpi, MACROBLOCK *x, #endif // CONFIG_EXT_TX assert(skip_prob > 0); + s0 = vp10_cost_bit(skip_prob, 0); s1 = vp10_cost_bit(skip_prob, 1); @@ -2955,6 +2979,10 @@ void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize]; int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize]; +#if CONFIG_EXT_TX + assert(tx_size < TX_SIZES); +#endif // CONFIG_EXT_TX + if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y); if (xd->mb_to_right_edge < 0) @@ -3087,6 +3115,10 @@ static void select_tx_block(const VP10_COMP *cpi, MACROBLOCK *x, int tmp_eob = 0; int zero_blk_rate; +#if CONFIG_EXT_TX + assert(tx_size < TX_SIZES); +#endif // CONFIG_EXT_TX + if (ref_best_rd < 0) { *is_cost_valid = 0; return; @@ -3158,7 +3190,7 @@ static void select_tx_block(const VP10_COMP *cpi, MACROBLOCK *x, if (tx_size > TX_4X4) { BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; int bsl = b_height_log2_lookup[bsize]; - int sub_step = 1 << (2 * (tx_size - 1)); + int sub_step = num_4x4_blocks_txsize_lookup[tx_size - 1]; int i; int this_rate; int64_t this_dist; @@ -3167,6 +3199,9 @@ static void select_tx_block(const VP10_COMP *cpi, MACROBLOCK *x, int this_cost_valid = 1; int64_t tmp_rd = 0; +#if CONFIG_EXT_TX + assert(tx_size < TX_SIZES); +#endif // CONFIG_EXT_TX --bsl; for (i = 0; i < 4 && this_cost_valid; ++i) { int offsetr = (i >> 1) << bsl; @@ -3191,13 +3226,15 @@ static void select_tx_block(const VP10_COMP *cpi, MACROBLOCK *x, if (this_rd < sum_rd) { int idx, idy; - for (i = 0; i < (1 << tx_size); ++i) - pta[i] = ptl[i] = !(tmp_eob == 0); + for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i) + pta[i] = !(tmp_eob == 0); + for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i) + ptl[i] = !(tmp_eob == 0); txfm_partition_update(tx_above + (blk_col >> 1), tx_left + (blk_row >> 1), tx_size); inter_tx_size[0][0] = tx_size; - for (idy = 0; idy < (1 << tx_size) / 2; ++idy) - for (idx = 0; idx < (1 << tx_size) / 2; ++idx) + for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy) + for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx) inter_tx_size[idy][idx] = tx_size; mbmi->tx_size = tx_size; if (this_rd == INT64_MAX) @@ -3453,6 +3490,10 @@ static void tx_block_rd(const VP10_COMP *cpi, MACROBLOCK *x, int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize]; int 
max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize]; +#if CONFIG_EXT_TX + assert(tx_size < TX_SIZES); +#endif // CONFIG_EXT_TX + if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y); if (xd->mb_to_right_edge < 0) @@ -3487,13 +3528,13 @@ static void tx_block_rd(const VP10_COMP *cpi, MACROBLOCK *x, coeff_ctx = combine_entropy_contexts(ta[0], tl[0]); vp10_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, coeff_ctx, rate, dist, bsse, skip); - for (i = 0; i < (1 << tx_size); ++i) { + for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i) ta[i] = !(p->eobs[block] == 0); + for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i) tl[i] = !(p->eobs[block] == 0); - } } else { int bsl = b_width_log2_lookup[bsize]; - int step = 1 << (2 * (tx_size - 1)); + int step = num_4x4_blocks_txsize_lookup[tx_size - 1]; int i; assert(bsl > 0); @@ -3590,7 +3631,7 @@ static int inter_block_uvrd(const VP10_COMP *cpi, MACROBLOCK *x, return is_cost_valid; } -#endif +#endif // CONFIG_VAR_TX // Return value 0: early termination triggered, no valid rd cost available; // 1: rd cost values are valid. @@ -4402,11 +4443,22 @@ static int64_t encode_inter_mb_segment(VP10_COMP *cpi, const uint8_t *const src = &p->src.buf[vp10_raster_block_offset(BLOCK_8X8, i, p->src.stride)]; uint8_t *const dst = &pd->dst.buf[vp10_raster_block_offset(BLOCK_8X8, i, - pd->dst.stride)]; + pd->dst.stride)]; int64_t thisdistortion = 0, thissse = 0; int thisrate = 0; - TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, TX_4X4); - const scan_order *so = get_scan(TX_4X4, tx_type, 1); + TX_SIZE tx_size = mi->mbmi.tx_size; + + TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, tx_size); + const scan_order *so = get_scan(tx_size, tx_type, 1); + const int num_4x4_w = num_4x4_blocks_wide_txsize_lookup[tx_size]; + const int num_4x4_h = num_4x4_blocks_high_txsize_lookup[tx_size]; + +#if CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX + assert(tx_size == max_txsize_rect_lookup[mi->mbmi.sb_type]); +#else + assert(tx_size == TX_4X4); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX + assert(tx_type == DCT_DCT); vp10_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col); @@ -4427,39 +4479,51 @@ static int64_t encode_inter_mb_segment(VP10_COMP *cpi, #endif // CONFIG_VP9_HIGHBITDEPTH k = i; - for (idy = 0; idy < height / 4; ++idy) { - for (idx = 0; idx < width / 4; ++idx) { - int64_t dist, ssz, rd, rd1, rd2; + for (idy = 0; idy < height / 4; idy += num_4x4_h) { + for (idx = 0; idx < width / 4; idx += num_4x4_w) { + int64_t dist, ssz, rd, rd1, rd2, block; int coeff_ctx; k += (idy * 2 + idx); + if (tx_size == TX_4X4) + block = k; + else + block = (i ? 
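/* Each rectangular sub8x8 transform holds 32 coefficients, i.e. two 4x4
   coefficient units, so the second transform's coefficients start at unit
   offset 2 whether the 8x8 is split as 4x8 (i == 1) or 8x4 (i == 2);
   hence block is 0 or 2. */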
2 : 0); coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)), *(tl + (k >> 1))); #if CONFIG_NEW_QUANT - vp10_xform_quant_fp_nuq(x, 0, k, idy + (i >> 1), idx + (i & 0x01), - BLOCK_8X8, TX_4X4, coeff_ctx); + vp10_xform_quant_fp_nuq(x, 0, block, idy + (i >> 1), idx + (i & 0x01), + BLOCK_8X8, tx_size, coeff_ctx); #else - vp10_xform_quant(x, 0, k, idy + (i >> 1), idx + (i & 0x01), BLOCK_8X8, - TX_4X4, VP10_XFORM_QUANT_FP); + vp10_xform_quant(x, 0, block, idy + (i >> 1), idx + (i & 0x01), BLOCK_8X8, + tx_size, VP10_XFORM_QUANT_FP); #endif // CONFIG_NEW_QUANT if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0) - vp10_optimize_b(x, 0, k, TX_4X4, coeff_ctx); - dist_block(cpi, x, 0, k, idy + (i >> 1), idx + (i & 0x1), TX_4X4, + vp10_optimize_b(x, 0, block, tx_size, coeff_ctx); + dist_block(cpi, x, 0, block, idy + (i >> 1), idx + (i & 0x1), tx_size, &dist, &ssz); thisdistortion += dist; thissse += ssz; #if CONFIG_VAR_TX - thisrate += cost_coeffs(x, 0, k, coeff_ctx, - TX_4X4, + thisrate += cost_coeffs(x, 0, block, coeff_ctx, + tx_size, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); - *(ta + (k & 1)) = !(p->eobs[k] == 0); - *(tl + (k >> 1)) = !(p->eobs[k] == 0); + *(ta + (k & 1)) = !(p->eobs[block] == 0); + *(tl + (k >> 1)) = !(p->eobs[block] == 0); #else - thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), - TX_4X4, + thisrate += cost_coeffs(x, 0, block, ta + (k & 1), tl + (k >> 1), + tx_size, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); -#endif +#if CONFIG_EXT_TX + if (tx_size == TX_8X4) { + *(ta + (k & 1) + 1) = *(ta + (k & 1)); + } + if (tx_size == TX_4X8) { + *(tl + (k >> 1) + 1) = *(tl + (k >> 1)); + } +#endif // CONFIG_EXT_TX +#endif // CONFIG_VAR_TX rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion); rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse); rd = VPXMIN(rd1, rd2); @@ -4951,6 +5015,11 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x, const int has_second_rf = has_second_ref(mbmi); const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; +#if CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX + mbmi->tx_size = max_txsize_rect_lookup[bsize]; +#else + mbmi->tx_size = TX_4X4; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && !CONFIG_VAR_TX vp10_zero(*bsi); @@ -5020,8 +5089,8 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x, #if CONFIG_EXT_INTER mv_ref_list, #endif // CONFIG_EXT_INTER - &frame_mv[NEARESTMV][frame], - &frame_mv[NEARMV][frame]); + &frame_mv[NEARESTMV][frame], + &frame_mv[NEARMV][frame]); #if CONFIG_REF_MV tmp_ref_mv[ref] = frame_mv[NEARESTMV][mbmi->ref_frame[ref]]; @@ -5072,10 +5141,11 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x, #if CONFIG_EXT_INTER for (this_mode = (has_second_rf ? NEAREST_NEARESTMV : NEARESTMV); this_mode <= (has_second_rf ? NEW_NEWMV : NEWFROMNEARMV); - ++this_mode) { + ++this_mode) #else - for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) #endif // CONFIG_EXT_INTER + { const struct buf_2d orig_src = x->plane[0].src; struct buf_2d orig_pre[2]; // This flag controls if the motion estimation will kick off. 
When it @@ -5342,10 +5412,11 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x, this_mode == NEWMV && #endif // CONFIG_EXT_INTER #if CONFIG_DUAL_FILTER - (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search)) { + (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search)) #else - (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search)) { + (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search)) #endif + { // adjust src pointers mi_buf_shift(x, i); if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c index c25f8bcb5..734ae8be0 100644 --- a/vp10/encoder/tokenize.c +++ b/vp10/encoder/tokenize.c @@ -393,7 +393,7 @@ static INLINE void add_token_no_extra(TOKENEXTRA **t, static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id, TX_SIZE tx_size) { - const int eob_max = 16 << (tx_size << 1); + const int eob_max = num_4x4_blocks_txsize_lookup[tx_size] << 4; return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; } @@ -463,21 +463,21 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col, const scan_order *const so = get_scan(tx_size, tx_type, is_inter_block(mbmi)); const int ref = is_inter_block(mbmi); unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = - td->rd_counts.coef_counts[tx_size][type][ref]; + td->rd_counts.coef_counts[txsize_sqr_map[tx_size]][type][ref]; #if CONFIG_ENTROPY vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = cpi->subframe_stats.coef_probs_buf[cpi->common.coef_probs_update_idx] - [tx_size][type][ref]; + [txsize_sqr_map[tx_size]][type][ref]; #else vpx_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = - cpi->common.fc->coef_probs[tx_size][type][ref]; + cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref]; #endif // CONFIG_ENTROPY #if CONFIG_ANS rans_dec_lut (*const coef_cdfs)[COEFF_CONTEXTS] = - cpi->common.fc->coef_cdfs[tx_size][type][ref]; + cpi->common.fc->coef_cdfs[txsize_sqr_map[tx_size]][type][ref]; #endif // CONFIG_ANS unsigned int (*const eob_branch)[COEFF_CONTEXTS] = - td->counts->eob_branch[tx_size][type][ref]; + td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref]; const uint8_t *const band = get_band_translate(tx_size); const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); int skip_eob = 0; @@ -539,7 +539,7 @@ int vp10_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { int result = 1; struct is_skippable_args args = {x->plane[plane].eobs, &result}; vp10_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable, - &args); + &args); return result; } @@ -560,7 +560,7 @@ int vp10_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { int result = 0; struct is_skippable_args args = {x->plane[plane].eobs, &result}; vp10_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, - has_high_freq_coeff, &args); + has_high_freq_coeff, &args); return result; } @@ -582,6 +582,9 @@ void tokenize_tx(ThreadData *td, TOKENEXTRA **t, int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize]; int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize]; + + assert(tx_size < TX_SIZES); + if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y); if (xd->mb_to_right_edge < 0) @@ -608,7 +611,7 @@ void tokenize_tx(ThreadData *td, TOKENEXTRA **t, for (i = 0; i < 4; ++i) { const int offsetr = blk_row + ((i >> 1) << bsl); const int offsetc = blk_col + ((i & 0x01) << bsl); - 
int step = 1 << (2 * (tx_size - 1)); + int step = num_4x4_blocks_txsize_lookup[tx_size - 1]; if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; @@ -659,7 +662,7 @@ void vp10_tokenize_sb_inter(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int bh = num_4x4_blocks_wide_lookup[txb_size]; int idx, idy; int block = 0; - int step = 1 << (max_tx_size * 2); + int step = num_4x4_blocks_txsize_lookup[max_tx_size]; for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bh) { tokenize_tx(td, t, dry_run, max_tx_size, plane_bsize, idy, idx, @@ -674,7 +677,7 @@ void vp10_tokenize_sb_inter(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t, } } } -#endif +#endif // CONFIG_VAR_TX void vp10_tokenize_sb(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize) { -- 2.40.0
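
Note (not part of the patch): the recurring mechanical change above replaces square-transform shift arithmetic -- step counts like 1 << (tx_size << 1) and coefficient bounds like 16 << (tx_size << 1) -- with per-dimension lookup tables, because a rectangular TX_SIZE such as TX_4X8 no longer encodes its 4x4-block count in its enum value. Below is a minimal standalone C sketch of that idea; the TX_SIZE ordering and table values are illustrative assumptions for this note, not the vp10 definitions.

/*
 * Sketch: why the old shift math breaks for 4x8/8x4 and how per-dimension
 * lookup tables fix it.  Enum ordering and table contents are assumed for
 * illustration (cf. num_4x4_blocks_wide_txsize_lookup /
 * num_4x4_blocks_high_txsize_lookup in the patch).
 */
#include <stdio.h>

typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_4X8, TX_8X4 } TX_SIZE;

/* Transform width/height measured in 4x4 blocks. */
static const int wide_4x4[] = { 1, 2, 4, 8, 1, 2 };
static const int high_4x4[] = { 1, 2, 4, 8, 2, 1 };

int main(void) {
  int t;
  for (t = TX_4X4; t <= TX_8X4; ++t) {
    /* Old code: step = 1 << (t << 1).  That is only valid while every
     * transform is square and the enum is densely ordered 4x4..32x32. */
    const int step = wide_4x4[t] * high_4x4[t]; /* 4x4 units covered */
    const int eob_max = step << 4;              /* 16 coeffs per 4x4 unit */
    printf("tx %d: %dx%d, step %d, eob_max %d\n",
           t, wide_4x4[t] * 4, high_4x4[t] * 4, step, eob_max);
  }
  return 0;
}

For TX_8X4 (enum value 5 here) the tables give step = 2 and eob_max = 32, whereas the old shift applied to the same enum value would yield 1 << 10 = 1024; this is why every step/eob computation in the patch is routed through the lookup tables, and why square-only tables (coef_counts, scan orders via txsize_sqr_map) index by the squared-down size instead.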