granicus.if.org Git - libvpx/blob - vp9/encoder/vp9_encodemb.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11
  12 #include "./vp9_rtcd.h"
  13 #include "./vpx_config.h"
  14 #include "./vpx_dsp_rtcd.h"
  15
  16 #include "vpx_dsp/quantize.h"
  17 #include "vpx_mem/vpx_mem.h"
  18 #include "vpx_ports/mem.h"
  19
  20 #include "vp9/common/vp9_idct.h"
  21 #include "vp9/common/vp9_reconinter.h"
  22 #include "vp9/common/vp9_reconintra.h"
  23 #include "vp9/common/vp9_scan.h"
  24
  25 #include "vp9/encoder/vp9_encodemb.h"
  26 #include "vp9/encoder/vp9_rd.h"
  27 #include "vp9/encoder/vp9_tokenize.h"
  28
  /* Scratch entropy contexts shared by the per-block encode callbacks.
   * vp9_encode_sb() fills one ta/tl row per plane via
   * vp9_get_entropy_contexts(); encode_block() then indexes ta[plane] by the
   * transform block's column offset and tl[plane] by its row offset, and
   * overwrites both entries with "block has coefficients" after coding it.
   */
  29 struct optimize_ctx {
  30   ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
  31   ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
  32 };
  33
  34 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
  35   struct macroblock_plane *const p = &x->plane[plane];
  36   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
  37   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
  38   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  39   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
  40
  41 #if CONFIG_VP9_HIGHBITDEPTH
  42   if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
  43     vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
  44                               p->src.stride, pd->dst.buf, pd->dst.stride,
  45                               x->e_mbd.bd);
  46     return;
  47   }
  48 #endif  // CONFIG_VP9_HIGHBITDEPTH
  49   vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
  50                      pd->dst.buf, pd->dst.stride);
  51 }
  52
  53 #define RDTRUNC(RM, DM, R, D)                        \
  54   (((1 << (VP9_PROB_COST_SHIFT - 1)) + (R) * (RM)) & \
  55    ((1 << VP9_PROB_COST_SHIFT) - 1))
  56
  57 typedef struct vp9_token_state {
  58   int           rate;
  59   int           error;
  60   int           next;
  61   int16_t       token;
  62   int16_t       qc;
  63 } vp9_token_state;
  64
  65 // TODO(jimbankoski): experiment to find optimal RD numbers.
     // Scale factor applied to mb->rdmult by optimize_b(), indexed by the
     // PLANE_TYPE that get_plane_type() returns for the plane being coded.
  66 static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 };
  67
  68 #define UPDATE_RD_COST()\
  69 {\
  70   rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
  71   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
  72   if (rd_cost0 == rd_cost1) {\
  73     rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
  74     rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
  75   }\
  76 }
  77
  78 // This function is a place holder for now but may ultimately need
  79 // to scan previous tokens to work out the correct context.
  80 static int trellis_get_coeff_context(const int16_t *scan,
  81                                      const int16_t *nb,
  82                                      int idx, int token,
  83                                      uint8_t *token_cache) {
  84   int bak = token_cache[scan[idx]], pt;
  85   token_cache[scan[idx]] = vp9_pt_energy_class[token];
  86   pt = get_coef_context(nb, token_cache, idx + 1);
  87   token_cache[scan[idx]] = bak;
  88   return pt;
  89 }
  90
  /* Trellis-optimizes the quantized coefficients of one transform block.
   *
   * The coefficients are visited from scan position eob - 1 back to 0. Every
   * non-zero coefficient becomes a trellis node with two candidate states:
   *   state 0 - the value produced by the quantizer;
   *   state 1 - the value one step closer to zero, taken only when the
   *             dequantized magnitude exceeds the (scaled) source magnitude by
   *             less than one quantizer step; otherwise state 1 keeps the
   *             quantizer's value as well.
   * Each state remembers which state of the following node gives the lower RD
   * cost. After the backward pass the cheaper chain is written back into
   * qcoeff / dqcoeff and the end-of-block position is recomputed.
   *
   *   mb      - encoder macroblock state (coefficient buffers, token costs,
   *             rdmult / rddiv).
   *   plane   - plane index; selects mb->plane[] and xd->plane[].
   *   block   - block index handed to BLOCK_OFFSET() and used to index eobs[].
   *   tx_size - transform size of the block.
   *   ctx     - entropy context used when costing the first token of the block.
   *
   * Returns the new end-of-block position, which is also stored in
   * mb->plane[plane].eobs[block].
   *
   * NOTE(review): error sums and d2 = dx * dx are plain int; whether they can
   * overflow for large residuals is not verifiable from this file alone.
   */
  91 static int optimize_b(MACROBLOCK *mb, int plane, int block,
  92                       TX_SIZE tx_size, int ctx) {
  93   MACROBLOCKD *const xd = &mb->e_mbd;
  94   struct macroblock_plane *const p = &mb->plane[plane];
  95   struct macroblockd_plane *const pd = &xd->plane[plane];
       /* Non-zero for inter blocks; selects the token cost table and rdmult. */
  96   const int ref = is_inter_block(xd->mi[0]);
       /* Two candidate states per scan position. Row eob holds the sentinel, so
        * default_eob + 1 rows can be needed (1025 presumably covers the largest
        * transform - confirm against TX_SIZE). */
  97   vp9_token_state tokens[1025][2];
       /* best_index[i][s]: which state of the following node state s chose. */
  98   unsigned best_index[1025][2];
  99   uint8_t token_cache[1024];
 100   const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
 101   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
 102   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 103   const int eob = p->eobs[block];
 104   const PLANE_TYPE type = get_plane_type(plane);
 105   const int default_eob = 16 << (tx_size << 1);
       /* dqcoeff is stored divided by mul (see the write-back loop below), so
        * differences against coeff are scaled back up by mul. */
 106   const int mul = 1 + (tx_size == TX_32X32);
       /* Indexed with (rc != 0): entry 0 for raster position 0, entry 1 for
        * every other position. */
 107   const int16_t *dequant_ptr = pd->dequant;
 108   const uint8_t *const band_translate = get_band_translate(tx_size);
 109   const scan_order *const so = get_scan(xd, tx_size, type, block);
 110   const int16_t *const scan = so->scan;
 111   const int16_t *const nb = so->neighbors;
       /* next: scan index of the current head of the trellis. */
 112   int next = eob, sz = 0;
 113   int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
 114   int64_t rd_cost0, rd_cost1;
 115   int rate0, rate1, error0, error1;
 116   int16_t t0, t1;
 117   EXTRABIT e0;
 118   int best, band, pt, i, final_eob;
 119 #if CONFIG_VP9_HIGHBITDEPTH
 120   const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
 121 #else
 122   const int *cat6_high_cost = vp9_get_high_cost_table(8);
 123 #endif
 124
 125   assert((!type && !plane) || (type && plane));
 126   assert(eob <= default_eob);
 127
 128   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
       /* Blocks that are not inter-coded use 9/16 of the multiplier. */
 129   if (!ref)
 130     rdmult = (rdmult * 9) >> 4;
 131
 132   /* Initialize the sentinel node of the trellis. */
 133   tokens[eob][0].rate = 0;
 134   tokens[eob][0].error = 0;
 135   tokens[eob][0].next = default_eob;
 136   tokens[eob][0].token = EOB_TOKEN;
 137   tokens[eob][0].qc = 0;
 138   tokens[eob][1] = tokens[eob][0];
 139
       /* Seed the cache with the energy class of every coefficient as it is
        * currently quantized. */
 140   for (i = 0; i < eob; i++)
 141     token_cache[scan[i]] =
 142         vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];
 143
       /* Backward pass: i runs from eob - 1 down to 0 and is -1 on exit. */
 144   for (i = eob; i-- > 0;) {
 145     int base_bits, d2, dx;
 146     const int rc = scan[i];
 147     int x = qcoeff[rc];
 148     /* Only add a trellis state for non-zero coefficients. */
 149     if (x) {
 150       int shortcut = 0;
 151       error0 = tokens[next][0].error;
 152       error1 = tokens[next][1].error;
 153       /* Evaluate the first possibility for this state. */
 154       rate0 = tokens[next][0].rate;
 155       rate1 = tokens[next][1].rate;
 156       vp9_get_token_extra(x, &t0, &e0);
 157       /* Consider both possible successor states. */
 158       if (next < default_eob) {
 159         band = band_translate[i + 1];
 160         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
 161         rate0 += mb->token_costs[tx_size][type][ref][band][0][pt]
 162                                 [tokens[next][0].token];
 163         rate1 += mb->token_costs[tx_size][type][ref][band][0][pt]
 164                                 [tokens[next][1].token];
 165       }
 166       UPDATE_RD_COST();
 167       /* And pick the best. */
 168       best = rd_cost1 < rd_cost0;
 169       base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
           /* Reconstruction error of the unmodified coefficient. */
 170       dx = mul * (dqcoeff[rc] - coeff[rc]);
 171 #if CONFIG_VP9_HIGHBITDEPTH
 172       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 173         dx >>= xd->bd - 8;
 174       }
 175 #endif  // CONFIG_VP9_HIGHBITDEPTH
 176       d2 = dx * dx;
 177       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
 178       tokens[i][0].error = d2 + (best ? error1 : error0);
 179       tokens[i][0].next = next;
 180       tokens[i][0].token = t0;
 181       tokens[i][0].qc = x;
 182       best_index[i][0] = best;
 183
 184       /* Evaluate the second possibility for this state. */
 185       rate0 = tokens[next][0].rate;
 186       rate1 = tokens[next][1].rate;
 187
           /* The reduced value is tried only when the dequantized magnitude lies
            * strictly between the scaled source magnitude and that magnitude
            * plus one quantizer step. */
 188       if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
 189           (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
 190                                                dequant_ptr[rc != 0]))
 191         shortcut = 1;
 192       else
 193         shortcut = 0;
 194
 195       if (shortcut) {
             /* sz is -1 for negative x and 0 otherwise, so the next line moves
              * x one step toward zero. */
 196         sz = -(x < 0);
 197         x -= 2 * sz + 1;
 198       }
 199
 200       /* Consider both possible successor states. */
 201       if (!x) {
 202         /* If we reduced this coefficient to zero, check to see if
 203          *  we need to move the EOB back here.
 204          */
 205         t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
 206         t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
 207         e0 = 0;
 208       } else {
 209         vp9_get_token_extra(x, &t0, &e0);
 210         t1 = t0;
 211       }
 212       if (next < default_eob) {
 213         band = band_translate[i + 1];
 214         if (t0 != EOB_TOKEN) {
 215           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
 216           rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
 217                                   [tokens[next][0].token];
 218         }
 219         if (t1 != EOB_TOKEN) {
 220           pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
 221           rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
 222                                   [tokens[next][1].token];
 223         }
 224       }
 225
 226       UPDATE_RD_COST();
 227       /* And pick the best. */
 228       best = rd_cost1 < rd_cost0;
 229       base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
 230
 231       if (shortcut) {
             /* (v + sz) ^ sz equals v when sz == 0 and -v when sz == -1, so dx
              * drops by one quantizer step for positive x and rises by one for
              * negative x. Without the shortcut d2 keeps its state-0 value. */
 232 #if CONFIG_VP9_HIGHBITDEPTH
 233         if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 234           dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
 235         } else {
 236           dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
 237         }
 238 #else
 239         dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
 240 #endif  // CONFIG_VP9_HIGHBITDEPTH
 241         d2 = dx * dx;
 242       }
 243       tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
 244       tokens[i][1].error = d2 + (best ? error1 : error0);
 245       tokens[i][1].next = next;
 246       tokens[i][1].token = best ? t1 : t0;
 247       tokens[i][1].qc = x;
 248       best_index[i][1] = best;
 249       /* Finally, make this the new head of the trellis. */
 250       next = i;
 251     } else {
 252       /* There's no choice to make for a zero coefficient, so we don't
 253        *  add a new trellis node, but we do need to update the costs.
 254        */
 255       band = band_translate[i + 1];
 256       t0 = tokens[next][0].token;
 257       t1 = tokens[next][1].token;
 258       /* Update the cost of each path if we're past the EOB token. */
 259       if (t0 != EOB_TOKEN) {
 260         tokens[next][0].rate +=
 261             mb->token_costs[tx_size][type][ref][band][1][0][t0];
 262         tokens[next][0].token = ZERO_TOKEN;
 263       }
 264       if (t1 != EOB_TOKEN) {
 265         tokens[next][1].rate +=
 266             mb->token_costs[tx_size][type][ref][band][1][0][t1];
 267         tokens[next][1].token = ZERO_TOKEN;
 268       }
 269       best_index[i][0] = best_index[i][1] = 0;
 270       /* Don't update next, because we didn't add a new node. */
 271     }
 272   }
 273
 274   /* Now pick the best path through the whole trellis. */
       /* The loop above leaves i == -1, so this reads band_translate[0]. */
 275   band = band_translate[i + 1];
 276   rate0 = tokens[next][0].rate;
 277   rate1 = tokens[next][1].rate;
 278   error0 = tokens[next][0].error;
 279   error1 = tokens[next][1].error;
 280   t0 = tokens[next][0].token;
 281   t1 = tokens[next][1].token;
       /* The first token is costed with the caller-supplied context. */
 282   rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0];
 283   rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1];
 284   UPDATE_RD_COST();
 285   best = rd_cost1 < rd_cost0;
 286   final_eob = -1;
       /* Clear the whole block, then rewrite only the positions that are nodes
        * of the chosen chain. */
 287   memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
 288   memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
 289   for (i = next; i < eob; i = next) {
 290     const int x = tokens[i][best].qc;
 291     const int rc = scan[i];
         /* Track the last scan position that still holds a non-zero value. */
 292     if (x) {
 293       final_eob = i;
 294     }
 295
 296     qcoeff[rc] = x;
 297     dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
 298
 299     next = tokens[i][best].next;
 300     best = best_index[i][best];
 301   }
       /* Convert "index of last non-zero" into a count (0 when none remain). */
 302   final_eob++;
 303
 304   mb->plane[plane].eobs[block] = final_eob;
 305   return final_eob;
 306 }
 307
 308 static INLINE void fdct32x32(int rd_transform,
 309                              const int16_t *src, tran_low_t *dst,
 310                              int src_stride) {
 311   if (rd_transform)
 312     vpx_fdct32x32_rd(src, dst, src_stride);
 313   else
 314     vpx_fdct32x32(src, dst, src_stride);
 315 }
 316
 317 #if CONFIG_VP9_HIGHBITDEPTH
 318 static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
 319                                     tran_low_t *dst, int src_stride) {
 320   if (rd_transform)
 321     vpx_highbd_fdct32x32_rd(src, dst, src_stride);
 322   else
 323     vpx_highbd_fdct32x32(src, dst, src_stride);
 324 }
 325 #endif  // CONFIG_VP9_HIGHBITDEPTH
 326
 327 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
 328                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
 329   MACROBLOCKD *const xd = &x->e_mbd;
 330   const struct macroblock_plane *const p = &x->plane[plane];
 331   const struct macroblockd_plane *const pd = &xd->plane[plane];
 332   const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
 333   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
 334   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
 335   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 336   uint16_t *const eob = &p->eobs[block];
 337   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
 338   int i, j;
 339   const int16_t *src_diff;
 340   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 341   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 342
 343 #if CONFIG_VP9_HIGHBITDEPTH
 344   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 345     switch (tx_size) {
 346       case TX_32X32:
 347         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
 348         vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
 349                                      p->round_fp, p->quant_fp, p->quant_shift,
 350                                      qcoeff, dqcoeff, pd->dequant,
 351                                      eob, scan_order->scan,
 352                                      scan_order->iscan);
 353         break;
 354       case TX_16X16:
 355         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
 356         vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
 357                                p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
 358                                pd->dequant, eob,
 359                                scan_order->scan, scan_order->iscan);
 360         break;
 361       case TX_8X8:
 362         vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
 363         vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
 364                                p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
 365                                pd->dequant, eob,
 366                                scan_order->scan, scan_order->iscan);
 367         break;
 368       case TX_4X4:
 369         x->fwd_txm4x4(src_diff, coeff, diff_stride);
 370         vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
 371                                p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
 372                                pd->dequant, eob,
 373                                scan_order->scan, scan_order->iscan);
 374         break;
 375       default:
 376         assert(0);
 377     }
 378     return;
 379   }
 380 #endif  // CONFIG_VP9_HIGHBITDEPTH
 381
 382   switch (tx_size) {
 383     case TX_32X32:
 384       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
 385       vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
 386                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
 387                             pd->dequant, eob, scan_order->scan,
 388                             scan_order->iscan);
 389       break;
 390     case TX_16X16:
 391       vpx_fdct16x16(src_diff, coeff, diff_stride);
 392       vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
 393                       p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
 394                       pd->dequant, eob,
 395                       scan_order->scan, scan_order->iscan);
 396       break;
 397     case TX_8X8:
 398       vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
 399                         x->skip_block, p->zbin, p->round_fp,
 400                         p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
 401                         pd->dequant, eob,
 402                         scan_order->scan, scan_order->iscan);
 403       break;
 404     case TX_4X4:
 405       x->fwd_txm4x4(src_diff, coeff, diff_stride);
 406       vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
 407                       p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
 408                       pd->dequant, eob,
 409                       scan_order->scan, scan_order->iscan);
 410       break;
 411     default:
 412       assert(0);
 413       break;
 414   }
 415 }
 416
 417 void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
 418                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
 419   MACROBLOCKD *const xd = &x->e_mbd;
 420   const struct macroblock_plane *const p = &x->plane[plane];
 421   const struct macroblockd_plane *const pd = &xd->plane[plane];
 422   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
 423   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
 424   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 425   uint16_t *const eob = &p->eobs[block];
 426   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
 427   int i, j;
 428   const int16_t *src_diff;
 429
 430   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 431   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 432
 433 #if CONFIG_VP9_HIGHBITDEPTH
 434   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 435     switch (tx_size) {
 436       case TX_32X32:
 437         vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
 438         vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
 439                                      p->quant_fp[0], qcoeff, dqcoeff,
 440                                      pd->dequant[0], eob);
 441         break;
 442       case TX_16X16:
 443         vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
 444         vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
 445                                p->quant_fp[0], qcoeff, dqcoeff,
 446                                pd->dequant[0], eob);
 447         break;
 448       case TX_8X8:
 449         vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
 450         vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
 451                                p->quant_fp[0], qcoeff, dqcoeff,
 452                                pd->dequant[0], eob);
 453         break;
 454       case TX_4X4:
 455         x->fwd_txm4x4(src_diff, coeff, diff_stride);
 456         vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
 457                                p->quant_fp[0], qcoeff, dqcoeff,
 458                                pd->dequant[0], eob);
 459         break;
 460       default:
 461         assert(0);
 462     }
 463     return;
 464   }
 465 #endif  // CONFIG_VP9_HIGHBITDEPTH
 466
 467   switch (tx_size) {
 468     case TX_32X32:
 469       vpx_fdct32x32_1(src_diff, coeff, diff_stride);
 470       vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
 471                             p->quant_fp[0], qcoeff, dqcoeff,
 472                             pd->dequant[0], eob);
 473       break;
 474     case TX_16X16:
 475       vpx_fdct16x16_1(src_diff, coeff, diff_stride);
 476       vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
 477                      p->quant_fp[0], qcoeff, dqcoeff,
 478                      pd->dequant[0], eob);
 479       break;
 480     case TX_8X8:
 481       vpx_fdct8x8_1(src_diff, coeff, diff_stride);
 482       vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
 483                       p->quant_fp[0], qcoeff, dqcoeff,
 484                       pd->dequant[0], eob);
 485       break;
 486     case TX_4X4:
 487       x->fwd_txm4x4(src_diff, coeff, diff_stride);
 488       vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
 489                       p->quant_fp[0], qcoeff, dqcoeff,
 490                       pd->dequant[0], eob);
 491       break;
 492     default:
 493       assert(0);
 494       break;
 495   }
 496 }
 497
 498 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
 499                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
 500   MACROBLOCKD *const xd = &x->e_mbd;
 501   const struct macroblock_plane *const p = &x->plane[plane];
 502   const struct macroblockd_plane *const pd = &xd->plane[plane];
 503   const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
 504   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
 505   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
 506   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 507   uint16_t *const eob = &p->eobs[block];
 508   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
 509   int i, j;
 510   const int16_t *src_diff;
 511   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 512   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 513
 514 #if CONFIG_VP9_HIGHBITDEPTH
 515   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 516      switch (tx_size) {
 517       case TX_32X32:
 518         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
 519         vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
 520                                     p->round, p->quant, p->quant_shift, qcoeff,
 521                                     dqcoeff, pd->dequant, eob,
 522                                     scan_order->scan, scan_order->iscan);
 523         break;
 524       case TX_16X16:
 525         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
 526         vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
 527                               p->quant, p->quant_shift, qcoeff, dqcoeff,
 528                               pd->dequant, eob,
 529                               scan_order->scan, scan_order->iscan);
 530         break;
 531       case TX_8X8:
 532         vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
 533         vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
 534                               p->quant, p->quant_shift, qcoeff, dqcoeff,
 535                               pd->dequant, eob,
 536                               scan_order->scan, scan_order->iscan);
 537         break;
 538       case TX_4X4:
 539         x->fwd_txm4x4(src_diff, coeff, diff_stride);
 540         vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
 541                               p->quant, p->quant_shift, qcoeff, dqcoeff,
 542                               pd->dequant, eob,
 543                               scan_order->scan, scan_order->iscan);
 544         break;
 545       default:
 546         assert(0);
 547     }
 548     return;
 549   }
 550 #endif  // CONFIG_VP9_HIGHBITDEPTH
 551
 552   switch (tx_size) {
 553     case TX_32X32:
 554       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
 555       vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
 556                            p->quant, p->quant_shift, qcoeff, dqcoeff,
 557                            pd->dequant, eob, scan_order->scan,
 558                            scan_order->iscan);
 559       break;
 560     case TX_16X16:
 561       vpx_fdct16x16(src_diff, coeff, diff_stride);
 562       vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
 563                      p->quant, p->quant_shift, qcoeff, dqcoeff,
 564                      pd->dequant, eob,
 565                      scan_order->scan, scan_order->iscan);
 566       break;
 567     case TX_8X8:
 568       vpx_fdct8x8(src_diff, coeff, diff_stride);
 569       vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
 570                      p->quant, p->quant_shift, qcoeff, dqcoeff,
 571                      pd->dequant, eob,
 572                      scan_order->scan, scan_order->iscan);
 573       break;
 574     case TX_4X4:
 575       x->fwd_txm4x4(src_diff, coeff, diff_stride);
 576       vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
 577                      p->quant, p->quant_shift, qcoeff, dqcoeff,
 578                      pd->dequant, eob,
 579                      scan_order->scan, scan_order->iscan);
 580       break;
 581     default:
 582       assert(0);
 583       break;
 584   }
 585 }
 586
 587 static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
 588                          TX_SIZE tx_size, void *arg) {
 589   struct encode_b_args *const args = arg;
 590   MACROBLOCK *const x = args->x;
 591   MACROBLOCKD *const xd = &x->e_mbd;
 592   struct optimize_ctx *const ctx = args->ctx;
 593   struct macroblock_plane *const p = &x->plane[plane];
 594   struct macroblockd_plane *const pd = &xd->plane[plane];
 595   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 596   int i, j;
 597   uint8_t *dst;
 598   ENTROPY_CONTEXT *a, *l;
 599   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 600   dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
 601   a = &ctx->ta[plane][i];
 602   l = &ctx->tl[plane][j];
 603
 604   // TODO(jingning): per transformed block zero forcing only enabled for
 605   // luma component. will integrate chroma components as well.
 606   if (x->zcoeff_blk[tx_size][block] && plane == 0) {
 607     p->eobs[block] = 0;
 608     *a = *l = 0;
 609     return;
 610   }
 611
 612   if (!x->skip_recode) {
 613     if (x->quant_fp) {
 614       // Encoding process for rtc mode
 615       if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) {
 616         // skip forward transform
 617         p->eobs[block] = 0;
 618         *a = *l = 0;
 619         return;
 620       } else {
 621         vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
 622       }
 623     } else {
 624       if (max_txsize_lookup[plane_bsize] == tx_size) {
 625         int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
 626         if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
 627           // full forward transform and quantization
 628           vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 629         } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
 630           // fast path forward transform and quantization
 631           vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
 632         } else {
 633           // skip forward transform
 634           p->eobs[block] = 0;
 635           *a = *l = 0;
 636           return;
 637         }
 638       } else {
 639         vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 640       }
 641     }
 642   }
 643
 644   if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
 645     const int ctx = combine_entropy_contexts(*a, *l);
 646     *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
 647   } else {
 648     *a = *l = p->eobs[block] > 0;
 649   }
 650
 651   if (p->eobs[block])
 652     *(args->skip) = 0;
 653
 654   if (x->skip_encode || p->eobs[block] == 0)
 655     return;
 656 #if CONFIG_VP9_HIGHBITDEPTH
 657   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 658     switch (tx_size) {
 659       case TX_32X32:
 660         vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride,
 661                                  p->eobs[block], xd->bd);
 662         break;
 663       case TX_16X16:
 664         vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride,
 665                                  p->eobs[block], xd->bd);
 666         break;
 667       case TX_8X8:
 668         vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride,
 669                                p->eobs[block], xd->bd);
 670         break;
 671       case TX_4X4:
 672         // this is like vp9_short_idct4x4 but has a special case around eob<=1
 673         // which is significant (not just an optimization) for the lossless
 674         // case.
 675         x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride,
 676                            p->eobs[block], xd->bd);
 677         break;
 678       default:
 679         assert(0 && "Invalid transform size");
 680     }
 681     return;
 682   }
 683 #endif  // CONFIG_VP9_HIGHBITDEPTH
 684
 685   switch (tx_size) {
 686     case TX_32X32:
 687       vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 688       break;
 689     case TX_16X16:
 690       vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 691       break;
 692     case TX_8X8:
 693       vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 694       break;
 695     case TX_4X4:
 696       // this is like vp9_short_idct4x4 but has a special case around eob<=1
 697       // which is significant (not just an optimization) for the lossless
 698       // case.
 699       x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 700       break;
 701     default:
 702       assert(0 && "Invalid transform size");
 703       break;
 704   }
 705 }
 706
 707 static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
 708                                TX_SIZE tx_size, void *arg) {
 709   MACROBLOCK *const x = (MACROBLOCK *)arg;
 710   MACROBLOCKD *const xd = &x->e_mbd;
 711   struct macroblock_plane *const p = &x->plane[plane];
 712   struct macroblockd_plane *const pd = &xd->plane[plane];
 713   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 714   int i, j;
 715   uint8_t *dst;
 716   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 717   dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
 718
 719   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 720
 721   if (p->eobs[block] > 0) {
 722 #if CONFIG_VP9_HIGHBITDEPTH
 723     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 724        x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
 725        return;
 726     }
 727 #endif  // CONFIG_VP9_HIGHBITDEPTH
 728     x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 729   }
 730 }
 731
 732 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
 733   vp9_subtract_plane(x, bsize, 0);
 734   vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
 735                                          encode_block_pass1, x);
 736 }
 737
 738 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
 739   MACROBLOCKD *const xd = &x->e_mbd;
 740   struct optimize_ctx ctx;
 741   MODE_INFO *mi = xd->mi[0];
 742   struct encode_b_args arg = {x, &ctx, &mi->skip};
 743   int plane;
 744
 745   mi->skip = 1;
 746
 747   if (x->skip)
 748     return;
 749
 750   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
 751     if (!x->skip_recode)
 752       vp9_subtract_plane(x, bsize, plane);
 753
 754     if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
 755       const struct macroblockd_plane* const pd = &xd->plane[plane];
 756       const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size;
 757       vp9_get_entropy_contexts(bsize, tx_size, pd,
 758                                ctx.ta[plane], ctx.tl[plane]);
 759     }
 760
 761     vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
 762                                            &arg);
 763   }
 764 }
 765
 766 void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
 767                             TX_SIZE tx_size, void *arg) {
 768   struct encode_b_args* const args = arg;
 769   MACROBLOCK *const x = args->x;
 770   MACROBLOCKD *const xd = &x->e_mbd;
 771   MODE_INFO *mi = xd->mi[0];
 772   struct macroblock_plane *const p = &x->plane[plane];
 773   struct macroblockd_plane *const pd = &xd->plane[plane];
 774   tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
 775   tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
 776   tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 777   const scan_order *scan_order;
 778   TX_TYPE tx_type = DCT_DCT;
 779   PREDICTION_MODE mode;
 780   const int bwl = b_width_log2_lookup[plane_bsize];
 781   const int diff_stride = 4 * (1 << bwl);
 782   uint8_t *src, *dst;
 783   int16_t *src_diff;
 784   uint16_t *eob = &p->eobs[block];
 785   const int src_stride = p->src.stride;
 786   const int dst_stride = pd->dst.stride;
 787   int i, j;
 788   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 789   dst = &pd->dst.buf[4 * (j * dst_stride + i)];
 790   src = &p->src.buf[4 * (j * src_stride + i)];
 791   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 792
 793   if (tx_size == TX_4X4) {
 794     tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block);
 795     scan_order = &vp9_scan_orders[TX_4X4][tx_type];
 796     mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mi->uv_mode;
 797   } else {
 798     mode = plane == 0 ? mi->mode : mi->uv_mode;
 799     if (tx_size == TX_32X32) {
 800       scan_order = &vp9_default_scan_orders[TX_32X32];
 801     } else {
 802       tx_type = get_tx_type(get_plane_type(plane), xd);
 803       scan_order = &vp9_scan_orders[tx_size][tx_type];
 804     }
 805   }
 806
 807   vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
 808                           x->skip_encode ? src_stride : dst_stride,
 809                           dst, dst_stride, i, j, plane);
 810
 811 #if CONFIG_VP9_HIGHBITDEPTH
 812   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 813     switch (tx_size) {
 814       case TX_32X32:
 815         if (!x->skip_recode) {
 816           vpx_highbd_subtract_block(32, 32, src_diff, diff_stride,
 817                                     src, src_stride, dst, dst_stride, xd->bd);
 818           highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
 819           vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
 820                                       p->round, p->quant, p->quant_shift,
 821                                       qcoeff, dqcoeff, pd->dequant, eob,
 822                                       scan_order->scan, scan_order->iscan);
 823         }
 824         if (!x->skip_encode && *eob) {
 825           vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
 826         }
 827         break;
 828       case TX_16X16:
 829         if (!x->skip_recode) {
 830           vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
 831                                     src, src_stride, dst, dst_stride, xd->bd);
 832           if (tx_type == DCT_DCT)
 833             vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
 834           else
 835             vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
 836           vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
 837                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
 838                                 pd->dequant, eob,
 839                                 scan_order->scan, scan_order->iscan);
 840         }
 841         if (!x->skip_encode && *eob) {
 842           vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride,
 843                                   *eob, xd->bd);
 844         }
 845         break;
 846       case TX_8X8:
 847         if (!x->skip_recode) {
 848           vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
 849                                     src, src_stride, dst, dst_stride, xd->bd);
 850           if (tx_type == DCT_DCT)
 851             vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
 852           else
 853             vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
 854           vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
 855                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
 856                                 pd->dequant, eob,
 857                                 scan_order->scan, scan_order->iscan);
 858         }
 859         if (!x->skip_encode && *eob) {
 860           vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,
 861                                 xd->bd);
 862         }
 863         break;
 864       case TX_4X4:
 865         if (!x->skip_recode) {
 866           vpx_highbd_subtract_block(4, 4, src_diff, diff_stride,
 867                                     src, src_stride, dst, dst_stride, xd->bd);
 868           if (tx_type != DCT_DCT)
 869             vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
 870           else
 871             x->fwd_txm4x4(src_diff, coeff, diff_stride);
 872           vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
 873                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
 874                                 pd->dequant, eob,
 875                                 scan_order->scan, scan_order->iscan);
 876         }
 877
 878         if (!x->skip_encode && *eob) {
 879           if (tx_type == DCT_DCT) {
 880             // this is like vp9_short_idct4x4 but has a special case around
 881             // eob<=1 which is significant (not just an optimization) for the
 882             // lossless case.
 883             x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
 884           } else {
 885             vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);
 886           }
 887         }
 888         break;
 889       default:
 890         assert(0);
 891         return;
 892     }
 893     if (*eob)
 894       *(args->skip) = 0;
 895     return;
 896   }
 897 #endif  // CONFIG_VP9_HIGHBITDEPTH
 898
 899   switch (tx_size) {
 900     case TX_32X32:
 901       if (!x->skip_recode) {
 902         vpx_subtract_block(32, 32, src_diff, diff_stride,
 903                            src, src_stride, dst, dst_stride);
 904         fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
 905         vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
 906                              p->quant, p->quant_shift, qcoeff, dqcoeff,
 907                              pd->dequant, eob, scan_order->scan,
 908                              scan_order->iscan);
 909       }
 910       if (!x->skip_encode && *eob)
 911         vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
 912       break;
 913     case TX_16X16:
 914       if (!x->skip_recode) {
 915         vpx_subtract_block(16, 16, src_diff, diff_stride,
 916                            src, src_stride, dst, dst_stride);
 917         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
 918         vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
 919                        p->quant, p->quant_shift, qcoeff, dqcoeff,
 920                        pd->dequant, eob, scan_order->scan,
 921                        scan_order->iscan);
 922       }
 923       if (!x->skip_encode && *eob)
 924         vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
 925       break;
 926     case TX_8X8:
 927       if (!x->skip_recode) {
 928         vpx_subtract_block(8, 8, src_diff, diff_stride,
 929                            src, src_stride, dst, dst_stride);
 930         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
 931         vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
 932                        p->quant_shift, qcoeff, dqcoeff,
 933                        pd->dequant, eob, scan_order->scan,
 934                        scan_order->iscan);
 935       }
 936       if (!x->skip_encode && *eob)
 937         vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
 938       break;
 939     case TX_4X4:
 940       if (!x->skip_recode) {
 941         vpx_subtract_block(4, 4, src_diff, diff_stride,
 942                            src, src_stride, dst, dst_stride);
 943         if (tx_type != DCT_DCT)
 944           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
 945         else
 946           x->fwd_txm4x4(src_diff, coeff, diff_stride);
 947         vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
 948                        p->quant_shift, qcoeff, dqcoeff,
 949                        pd->dequant, eob, scan_order->scan,
 950                        scan_order->iscan);
 951       }
 952
 953       if (!x->skip_encode && *eob) {
 954         if (tx_type == DCT_DCT)
 955           // this is like vp9_short_idct4x4 but has a special case around eob<=1
 956           // which is significant (not just an optimization) for the lossless
 957           // case.
 958           x->itxm_add(dqcoeff, dst, dst_stride, *eob);
 959         else
 960           vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
 961       }
 962       break;
 963     default:
 964       assert(0);
 965       break;
 966   }
 967   if (*eob)
 968     *(args->skip) = 0;
 969 }
 970
 971 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
 972   const MACROBLOCKD *const xd = &x->e_mbd;
 973   struct encode_b_args arg = {x, NULL, &xd->mi[0]->skip};
 974
 975   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
 976                                          vp9_encode_block_intra, &arg);
 977 }