From 7105e66d19811add930055959598beb68f21df29 Mon Sep 17 00:00:00 2001 From: Urvang Joshi Date: Fri, 28 Jul 2017 15:57:22 -0700 Subject: [PATCH] Remove the DP version of vp9_optimize_b(). The greedy version was already enabled by default here: https://chromium-review.googlesource.com/c/546848/ And the speed+compression gains from the greedy version were already mentioned here: https://chromium-review.googlesource.com/c/531675/ Change-Id: Iad9f7d03490c845ad1e230af028c9d39edddca97 --- vp9/encoder/vp9_encodemb.c | 283 ------------------------------------- 1 file changed, 283 deletions(-) diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 2a50c9912..c8b7977aa 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -53,10 +53,6 @@ static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { { 10, 6 }, { 8, 5 }, }; -#define USE_GREEDY_OPTIMIZE_B 1 - -#if USE_GREEDY_OPTIMIZE_B - // 'num' can be negative, but 'shift' must be non-negative. #define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ ((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift)) @@ -306,285 +302,6 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, } #undef RIGHT_SHIFT_POSSIBLY_NEGATIVE -#else - -#define UPDATE_RD_COST() \ - { \ - rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \ - rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \ - } - -// This function is a place holder for now but may ultimately need -// to scan previous tokens to work out the correct context. 
-static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb, - int idx, int token, uint8_t *token_cache) { - int bak = token_cache[scan[idx]], pt; - token_cache[scan[idx]] = vp9_pt_energy_class[token]; - pt = get_coef_context(nb, token_cache, idx + 1); - token_cache[scan[idx]] = bak; - return pt; -} - -static const int16_t band_count_table[TX_SIZES][8] = { - { 1, 2, 3, 4, 3, 16 - 13, 0 }, - { 1, 2, 3, 4, 11, 64 - 21, 0 }, - { 1, 2, 3, 4, 11, 256 - 21, 0 }, - { 1, 2, 3, 4, 11, 1024 - 21, 0 }, -}; -static const int16_t band_cum_count_table[TX_SIZES][8] = { - { 0, 1, 3, 6, 10, 13, 16, 0 }, - { 0, 1, 3, 6, 10, 21, 64, 0 }, - { 0, 1, 3, 6, 10, 21, 256, 0 }, - { 0, 1, 3, 6, 10, 21, 1024, 0 }, -}; - -typedef struct vp9_token_state { - int64_t error; - int rate; - int16_t next; - int16_t token; - tran_low_t qc; - tran_low_t dqc; - uint8_t best_index; -} vp9_token_state; - -int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, - int ctx) { - MACROBLOCKD *const xd = &mb->e_mbd; - struct macroblock_plane *const p = &mb->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - const int ref = is_inter_block(xd->mi[0]); - vp9_token_state tokens[1025][2]; - uint8_t token_cache[1024]; - const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); - tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const int eob = p->eobs[block]; - const PLANE_TYPE type = get_plane_type(plane); - const int default_eob = 16 << (tx_size << 1); - const int shift = (tx_size == TX_32X32); - const int16_t *const dequant_ptr = pd->dequant; - const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *const so = get_scan(xd, tx_size, type, block); - const int16_t *const scan = so->scan; - const int16_t *const nb = so->neighbors; - const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift }; - int next = eob, sz = 0; - 
const int64_t rdmult = ((int64_t)mb->rdmult * plane_rd_mult[ref][type]) >> 1; - const int64_t rddiv = mb->rddiv; - int64_t rd_cost0, rd_cost1; - int rate0, rate1; - int64_t error0, error1; - int16_t t0, t1; - int best, band = (eob < default_eob) ? band_translate[eob] - : band_translate[eob - 1]; - int pt, i, final_eob; -#if CONFIG_VP9_HIGHBITDEPTH - const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); -#else - const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); -#endif - unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - mb->token_costs[tx_size][type][ref]; - const int16_t *band_counts = &band_count_table[tx_size][band]; - int16_t band_left = eob - band_cum_count_table[tx_size][band] + 1; - - token_costs += band; - - assert((!type && !plane) || (type && plane)); - assert(eob <= default_eob); - - /* Now set up a Viterbi trellis to evaluate alternative roundings. */ - /* Initialize the sentinel node of the trellis. */ - tokens[eob][0].rate = 0; - tokens[eob][0].error = 0; - tokens[eob][0].next = default_eob; - tokens[eob][0].token = EOB_TOKEN; - tokens[eob][0].qc = 0; - tokens[eob][1] = tokens[eob][0]; - - for (i = 0; i < eob; i++) { - const int rc = scan[i]; - token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])]; - } - - for (i = eob; i-- > 0;) { - int base_bits, d2, dx; - const int rc = scan[i]; - int x = qcoeff[rc]; - /* Only add a trellis state for non-zero coefficients. */ - if (x) { - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - /* Evaluate the first possibility for this state. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost); - /* Consider both possible successor states. 
*/ - if (next < default_eob) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += (*token_costs)[0][pt][tokens[next][0].token]; - rate1 += (*token_costs)[0][pt][tokens[next][1].token]; - } - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx >>= xd->bd - 8; - } -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - tokens[i][0].rate = base_bits + (best ? rate1 : rate0); - tokens[i][0].error = d2 + (best ? error1 : error0); - tokens[i][0].next = next; - tokens[i][0].token = t0; - tokens[i][0].qc = x; - tokens[i][0].dqc = dqcoeff[rc]; - tokens[i][0].best_index = best; - - /* Evaluate the second possibility for this state. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - - if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) && - (abs(x) * dequant_ptr[rc != 0] < - (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) { - sz = -(x < 0); - x -= 2 * sz + 1; - } else { - tokens[i][1] = tokens[i][0]; - next = i; - - if (!(--band_left)) { - --band_counts; - band_left = *band_counts; - --token_costs; - } - continue; - } - - /* Consider both possible successor states. */ - if (!x) { - /* If we reduced this coefficient to zero, check to see if - * we need to move the EOB back here. - */ - t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - t1 = tokens[next][1].token == EOB_TOKEN ? 
EOB_TOKEN : ZERO_TOKEN; - base_bits = 0; - } else { - base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost); - t1 = t0; - } - if (next < default_eob) { - if (t0 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += (*token_costs)[!x][pt][tokens[next][0].token]; - } - if (t1 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); - rate1 += (*token_costs)[!x][pt][tokens[next][1].token]; - } - } - - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz; - } else { - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; - } -#else - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - - tokens[i][1].rate = base_bits + (best ? rate1 : rate0); - tokens[i][1].error = d2 + (best ? error1 : error0); - tokens[i][1].next = next; - tokens[i][1].token = best ? t1 : t0; - tokens[i][1].qc = x; - - if (x) { - tran_low_t offset = dq_step[rc != 0]; - // The 32x32 transform coefficient uses half quantization step size. - // Account for the rounding difference in the dequantized coefficeint - // value when the quantization index is dropped from an even number - // to an odd number. - if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01); - - if (sz == 0) - tokens[i][1].dqc = dqcoeff[rc] - offset; - else - tokens[i][1].dqc = dqcoeff[rc] + offset; - } else { - tokens[i][1].dqc = 0; - } - - tokens[i][1].best_index = best; - /* Finally, make this the new head of the trellis. */ - next = i; - } else { - /* There's no choice to make for a zero coefficient, so we don't - * add a new trellis node, but we do need to update the costs. - */ - pt = get_coef_context(nb, token_cache, i + 1); - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - /* Update the cost of each path if we're past the EOB token. 
*/ - if (t0 != EOB_TOKEN) { - tokens[next][0].rate += (*token_costs)[1][pt][t0]; - tokens[next][0].token = ZERO_TOKEN; - } - if (t1 != EOB_TOKEN) { - tokens[next][1].rate += (*token_costs)[1][pt][t1]; - tokens[next][1].token = ZERO_TOKEN; - } - tokens[i][0].best_index = tokens[i][1].best_index = 0; - /* Don't update next, because we didn't add a new node. */ - } - - if (!(--band_left)) { - --band_counts; - band_left = *band_counts; - --token_costs; - } - } - - /* Now pick the best path through the whole trellis. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - rate0 += (*token_costs)[0][ctx][t0]; - rate1 += (*token_costs)[0][ctx][t1]; - UPDATE_RD_COST(); - best = rd_cost1 < rd_cost0; - final_eob = -1; - - for (i = next; i < eob; i = next) { - const int x = tokens[i][best].qc; - const int rc = scan[i]; - if (x) final_eob = i; - qcoeff[rc] = x; - dqcoeff[rc] = tokens[i][best].dqc; - next = tokens[i][best].next; - best = tokens[i][best].best_index; - } - final_eob++; - - mb->plane[plane].eobs[block] = final_eob; - return final_eob; -} - -#endif // USE_GREEDY_OPTIMIZE_B - static INLINE void fdct32x32(int rd_transform, const int16_t *src, tran_low_t *dst, int src_stride) { if (rd_transform) -- 2.40.0