granicus.if.org Git - libvpx/blob - vp9/encoder/vp9_rdopt.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <assert.h>
  12 #include <math.h>
  13
  14 #include "./vp9_rtcd.h"
  15
  16 #include "vpx_mem/vpx_mem.h"
  17
  18 #include "vp9/common/vp9_common.h"
  19 #include "vp9/common/vp9_entropy.h"
  20 #include "vp9/common/vp9_entropymode.h"
  21 #include "vp9/common/vp9_idct.h"
  22 #include "vp9/common/vp9_mvref_common.h"
  23 #include "vp9/common/vp9_pred_common.h"
  24 #include "vp9/common/vp9_quant_common.h"
  25 #include "vp9/common/vp9_reconinter.h"
  26 #include "vp9/common/vp9_reconintra.h"
  27 #include "vp9/common/vp9_seg_common.h"
  28 #include "vp9/common/vp9_systemdependent.h"
  29
  30 #include "vp9/encoder/vp9_cost.h"
  31 #include "vp9/encoder/vp9_encodemb.h"
  32 #include "vp9/encoder/vp9_encodemv.h"
  33 #include "vp9/encoder/vp9_encoder.h"
  34 #include "vp9/encoder/vp9_mcomp.h"
  35 #include "vp9/encoder/vp9_quantize.h"
  36 #include "vp9/encoder/vp9_ratectrl.h"
  37 #include "vp9/encoder/vp9_rd.h"
  38 #include "vp9/encoder/vp9_rdopt.h"
  39 #include "vp9/encoder/vp9_variance.h"
  40
  41 #define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
  42                                  (1 << INTRA_FRAME))
  43 #define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
  44                                  (1 << INTRA_FRAME))
  45 #define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
  46                                  (1 << INTRA_FRAME))
  47
  48 #define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | 0x01)
  49
  50 #define MIN_EARLY_TERM_INDEX    3
  51
  52 typedef struct {
  53   PREDICTION_MODE mode;
  54   MV_REFERENCE_FRAME ref_frame[2];
  55 } MODE_DEFINITION;
  56
  57 typedef struct {
  58   MV_REFERENCE_FRAME ref_frame[2];
  59 } REF_DEFINITION;
  60
  61 struct rdcost_block_args {
  62   MACROBLOCK *x;
  63   ENTROPY_CONTEXT t_above[16];
  64   ENTROPY_CONTEXT t_left[16];
  65   int rate;
  66   int64_t dist;
  67   int64_t sse;
  68   int this_rate;
  69   int64_t this_dist;
  70   int64_t this_sse;
  71   int64_t this_rd;
  72   int64_t best_rd;
  73   int skip;
  74   int use_fast_coef_costing;
  75   const scan_order *so;
  76 };
  77
  78 static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
  79   {NEARESTMV, {LAST_FRAME,   NONE}},
  80   {NEARESTMV, {ALTREF_FRAME, NONE}},
  81   {NEARESTMV, {GOLDEN_FRAME, NONE}},
  82
  83   {DC_PRED,   {INTRA_FRAME,  NONE}},
  84
  85   {NEWMV,     {LAST_FRAME,   NONE}},
  86   {NEWMV,     {ALTREF_FRAME, NONE}},
  87   {NEWMV,     {GOLDEN_FRAME, NONE}},
  88
  89   {NEARMV,    {LAST_FRAME,   NONE}},
  90   {NEARMV,    {ALTREF_FRAME, NONE}},
  91   {NEARMV,    {GOLDEN_FRAME, NONE}},
  92
  93   {ZEROMV,    {LAST_FRAME,   NONE}},
  94   {ZEROMV,    {GOLDEN_FRAME, NONE}},
  95   {ZEROMV,    {ALTREF_FRAME, NONE}},
  96
  97   {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
  98   {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
  99
 100   {TM_PRED,   {INTRA_FRAME,  NONE}},
 101
 102   {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
 103   {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
 104   {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
 105   {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
 106
 107   {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
 108   {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
 109
 110   {H_PRED,    {INTRA_FRAME,  NONE}},
 111   {V_PRED,    {INTRA_FRAME,  NONE}},
 112   {D135_PRED, {INTRA_FRAME,  NONE}},
 113   {D207_PRED, {INTRA_FRAME,  NONE}},
 114   {D153_PRED, {INTRA_FRAME,  NONE}},
 115   {D63_PRED,  {INTRA_FRAME,  NONE}},
 116   {D117_PRED, {INTRA_FRAME,  NONE}},
 117   {D45_PRED,  {INTRA_FRAME,  NONE}},
 118 };
 119
 120 static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
 121   {{LAST_FRAME,   NONE}},
 122   {{GOLDEN_FRAME, NONE}},
 123   {{ALTREF_FRAME, NONE}},
 124   {{LAST_FRAME,   ALTREF_FRAME}},
 125   {{GOLDEN_FRAME, ALTREF_FRAME}},
 126   {{INTRA_FRAME,  NONE}},
 127 };
 128
 129 static int raster_block_offset(BLOCK_SIZE plane_bsize,
 130                                int raster_block, int stride) {
 131   const int bw = b_width_log2_lookup[plane_bsize];
 132   const int y = 4 * (raster_block >> bw);
 133   const int x = 4 * (raster_block & ((1 << bw) - 1));
 134   return y * stride + x;
 135 }
 136 static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
 137                                           int raster_block, int16_t *base) {
 138   const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
 139   return base + raster_block_offset(plane_bsize, raster_block, stride);
 140 }
 141
 142 static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
 143                            int m, int n, int min_plane, int max_plane) {
 144   int i;
 145
 146   for (i = min_plane; i < max_plane; ++i) {
 147     struct macroblock_plane *const p = &x->plane[i];
 148     struct macroblockd_plane *const pd = &x->e_mbd.plane[i];
 149
 150     p->coeff    = ctx->coeff_pbuf[i][m];
 151     p->qcoeff   = ctx->qcoeff_pbuf[i][m];
 152     pd->dqcoeff = ctx->dqcoeff_pbuf[i][m];
 153     p->eobs     = ctx->eobs_pbuf[i][m];
 154
 155     ctx->coeff_pbuf[i][m]   = ctx->coeff_pbuf[i][n];
 156     ctx->qcoeff_pbuf[i][m]  = ctx->qcoeff_pbuf[i][n];
 157     ctx->dqcoeff_pbuf[i][m] = ctx->dqcoeff_pbuf[i][n];
 158     ctx->eobs_pbuf[i][m]    = ctx->eobs_pbuf[i][n];
 159
 160     ctx->coeff_pbuf[i][n]   = p->coeff;
 161     ctx->qcoeff_pbuf[i][n]  = p->qcoeff;
 162     ctx->dqcoeff_pbuf[i][n] = pd->dqcoeff;
 163     ctx->eobs_pbuf[i][n]    = p->eobs;
 164   }
 165 }
 166
 167 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
 168                             MACROBLOCK *x, MACROBLOCKD *xd,
 169                             int *out_rate_sum, int64_t *out_dist_sum,
 170                             int *skip_txfm_sb, int64_t *skip_sse_sb) {
 171   // Note our transform coeffs are 8 times an orthogonal transform.
 172   // Hence quantizer step is also 8 times. To get effective quantizer
 173   // we need to divide by 8 before sending to modeling function.
 174   int i;
 175   int64_t rate_sum = 0;
 176   int64_t dist_sum = 0;
 177   const int ref = xd->mi[0].src_mi->mbmi.ref_frame[0];
 178   unsigned int sse;
 179   unsigned int var = 0;
 180   unsigned int sum_sse = 0;
 181   int64_t total_sse = 0;
 182   int skip_flag = 1;
 183   const int shift = 6;
 184   int rate;
 185   int64_t dist;
 186
 187   x->pred_sse[ref] = 0;
 188
 189   for (i = 0; i < MAX_MB_PLANE; ++i) {
 190     struct macroblock_plane *const p = &x->plane[i];
 191     struct macroblockd_plane *const pd = &xd->plane[i];
 192     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
 193     const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 194     const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
 195     const int64_t dc_thr = p->quant_thred[0] >> shift;
 196     const int64_t ac_thr = p->quant_thred[1] >> shift;
 197     // The low thresholds are used to measure if the prediction errors are
 198     // low enough so that we can skip the mode search.
 199     const int64_t low_dc_thr = MIN(50, dc_thr >> 2);
 200     const int64_t low_ac_thr = MIN(80, ac_thr >> 2);
 201     int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
 202     int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
 203     int idx, idy;
 204     int lw = b_width_log2_lookup[unit_size] + 2;
 205     int lh = b_height_log2_lookup[unit_size] + 2;
 206
 207     sum_sse = 0;
 208
 209     for (idy = 0; idy < bh; ++idy) {
 210       for (idx = 0; idx < bw; ++idx) {
 211         uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
 212         uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
 213         int block_idx = (idy << 1) + idx;
 214         int low_err_skip = 0;
 215
 216         var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
 217                                         dst, pd->dst.stride, &sse);
 218         x->bsse[(i << 2) + block_idx] = sse;
 219         sum_sse += sse;
 220
 221         x->skip_txfm[(i << 2) + block_idx] = 0;
 222         if (!x->select_tx_size) {
 223           // Check if all ac coefficients can be quantized to zero.
 224           if (var < ac_thr || var == 0) {
 225             x->skip_txfm[(i << 2) + block_idx] = 2;
 226
 227             // Check if dc coefficient can be quantized to zero.
 228             if (sse - var < dc_thr || sse == var) {
 229               x->skip_txfm[(i << 2) + block_idx] = 1;
 230
 231               if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
 232                 low_err_skip = 1;
 233             }
 234           }
 235         }
 236
 237         if (skip_flag && !low_err_skip)
 238           skip_flag = 0;
 239
 240         if (i == 0)
 241           x->pred_sse[ref] += sse;
 242       }
 243     }
 244
 245     total_sse += sum_sse;
 246
 247     // Fast approximate the modelling function.
 248     if (cpi->oxcf.speed > 4) {
 249       int64_t rate;
 250       const int64_t square_error = sum_sse;
 251       int quantizer = (pd->dequant[1] >> 3);
 252 #if CONFIG_VP9_HIGHBITDEPTH
 253       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 254         quantizer >>= (xd->bd - 8);
 255       }
 256 #endif  // CONFIG_VP9_HIGHBITDEPTH
 257
 258       if (quantizer < 120)
 259         rate = (square_error * (280 - quantizer)) >> 8;
 260       else
 261         rate = 0;
 262       dist = (square_error * quantizer) >> 8;
 263       rate_sum += rate;
 264       dist_sum += dist;
 265     } else {
 266 #if CONFIG_VP9_HIGHBITDEPTH
 267       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 268         vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
 269                                      pd->dequant[1] >> (xd->bd - 5),
 270                                      &rate, &dist);
 271       } else {
 272         vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
 273                                      pd->dequant[1] >> 3, &rate, &dist);
 274       }
 275 #else
 276       vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
 277                                    pd->dequant[1] >> 3, &rate, &dist);
 278 #endif  // CONFIG_VP9_HIGHBITDEPTH
 279       rate_sum += rate;
 280       dist_sum += dist;
 281     }
 282   }
 283
 284   *skip_txfm_sb = skip_flag;
 285   *skip_sse_sb = total_sse << 4;
 286   *out_rate_sum = (int)rate_sum;
 287   *out_dist_sum = dist_sum << 4;
 288 }
 289
 290 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
 291                           intptr_t block_size, int64_t *ssz) {
 292   int i;
 293   int64_t error = 0, sqcoeff = 0;
 294
 295   for (i = 0; i < block_size; i++) {
 296     const int diff = coeff[i] - dqcoeff[i];
 297     error +=  diff * diff;
 298     sqcoeff += coeff[i] * coeff[i];
 299   }
 300
 301   *ssz = sqcoeff;
 302   return error;
 303 }
 304
 305
 306 #if CONFIG_VP9_HIGHBITDEPTH
 307 int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
 308                                  const tran_low_t *dqcoeff,
 309                                  intptr_t block_size,
 310                                  int64_t *ssz, int bd) {
 311   int i;
 312   int64_t error = 0, sqcoeff = 0;
 313   int shift = 2 * (bd - 8);
 314   int rounding = shift > 0 ? 1 << (shift - 1) : 0;
 315
 316   for (i = 0; i < block_size; i++) {
 317     const int64_t diff = coeff[i] - dqcoeff[i];
 318     error +=  diff * diff;
 319     sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
 320   }
 321   assert(error >= 0 && sqcoeff >= 0);
 322   error = (error + rounding) >> shift;
 323   sqcoeff = (sqcoeff + rounding) >> shift;
 324
 325   *ssz = sqcoeff;
 326   return error;
 327 }
 328 #endif  // CONFIG_VP9_HIGHBITDEPTH
 329
 330 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
 331  * decide whether to include cost of a trailing EOB node or not (i.e. we
 332  * can skip this if the last coefficient in this transform block, e.g. the
 333  * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
 334  * were non-zero). */
 335 static const int16_t band_counts[TX_SIZES][8] = {
 336   { 1, 2, 3, 4,  3,   16 - 13, 0 },
 337   { 1, 2, 3, 4, 11,   64 - 21, 0 },
 338   { 1, 2, 3, 4, 11,  256 - 21, 0 },
 339   { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 340 };
 341 static INLINE int cost_coeffs(MACROBLOCK *x,
 342                               int plane, int block,
 343                               ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
 344                               TX_SIZE tx_size,
 345                               const int16_t *scan, const int16_t *nb,
 346                               int use_fast_coef_costing) {
 347   MACROBLOCKD *const xd = &x->e_mbd;
 348   MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
 349   const struct macroblock_plane *p = &x->plane[plane];
 350   const struct macroblockd_plane *pd = &xd->plane[plane];
 351   const PLANE_TYPE type = pd->plane_type;
 352   const int16_t *band_count = &band_counts[tx_size][1];
 353   const int eob = p->eobs[block];
 354   const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
 355   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
 356                    x->token_costs[tx_size][type][is_inter_block(mbmi)];
 357   uint8_t token_cache[32 * 32];
 358   int pt = combine_entropy_contexts(*A, *L);
 359   int c, cost;
 360   // Check for consistency of tx_size with mode info
 361   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
 362                               : get_uv_tx_size(mbmi, pd) == tx_size);
 363
 364   if (eob == 0) {
 365     // single eob token
 366     cost = token_costs[0][0][pt][EOB_TOKEN];
 367     c = 0;
 368   } else {
 369     int band_left = *band_count++;
 370
 371     // dc token
 372     int v = qcoeff[0];
 373     int prev_t = vp9_dct_value_tokens_ptr[v].token;
 374     cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
 375     token_cache[0] = vp9_pt_energy_class[prev_t];
 376     ++token_costs;
 377
 378     // ac tokens
 379     for (c = 1; c < eob; c++) {
 380       const int rc = scan[c];
 381       int t;
 382
 383       v = qcoeff[rc];
 384       t = vp9_dct_value_tokens_ptr[v].token;
 385       if (use_fast_coef_costing) {
 386         cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
 387       } else {
 388         pt = get_coef_context(nb, token_cache, c);
 389         cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
 390         token_cache[rc] = vp9_pt_energy_class[t];
 391       }
 392       prev_t = t;
 393       if (!--band_left) {
 394         band_left = *band_count++;
 395         ++token_costs;
 396       }
 397     }
 398
 399     // eob token
 400     if (band_left) {
 401       if (use_fast_coef_costing) {
 402         cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
 403       } else {
 404         pt = get_coef_context(nb, token_cache, c);
 405         cost += (*token_costs)[0][pt][EOB_TOKEN];
 406       }
 407     }
 408   }
 409
 410   // is eob first coefficient;
 411   *A = *L = (c > 0);
 412
 413   return cost;
 414 }
 415
 416 #if CONFIG_VP9_HIGHBITDEPTH
 417 static void dist_block(int plane, int block, TX_SIZE tx_size,
 418                        struct rdcost_block_args* args, int bd) {
 419 #else
 420 static void dist_block(int plane, int block, TX_SIZE tx_size,
 421                        struct rdcost_block_args* args) {
 422 #endif  // CONFIG_VP9_HIGHBITDEPTH
 423   const int ss_txfrm_size = tx_size << 1;
 424   MACROBLOCK* const x = args->x;
 425   MACROBLOCKD* const xd = &x->e_mbd;
 426   const struct macroblock_plane *const p = &x->plane[plane];
 427   const struct macroblockd_plane *const pd = &xd->plane[plane];
 428   int64_t this_sse;
 429   int shift = tx_size == TX_32X32 ? 0 : 2;
 430   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
 431   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 432 #if CONFIG_VP9_HIGHBITDEPTH
 433   args->dist = vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
 434                                       &this_sse, bd) >> shift;
 435 #else
 436   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
 437                                &this_sse) >> shift;
 438 #endif  // CONFIG_VP9_HIGHBITDEPTH
 439   args->sse  = this_sse >> shift;
 440
 441   if (x->skip_encode && !is_inter_block(&xd->mi[0].src_mi->mbmi)) {
 442     // TODO(jingning): tune the model to better capture the distortion.
 443     int64_t p = (pd->dequant[1] * pd->dequant[1] *
 444                     (1 << ss_txfrm_size)) >> (shift + 2);
 445 #if CONFIG_VP9_HIGHBITDEPTH
 446     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 447       p >>= ((xd->bd - 8) * 2);
 448     }
 449 #endif  // CONFIG_VP9_HIGHBITDEPTH
 450     args->dist += (p >> 4);
 451     args->sse  += p;
 452   }
 453 }
 454
 455 static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
 456                        TX_SIZE tx_size, struct rdcost_block_args* args) {
 457   int x_idx, y_idx;
 458   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
 459
 460   args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
 461                            args->t_left + y_idx, tx_size,
 462                            args->so->scan, args->so->neighbors,
 463                            args->use_fast_coef_costing);
 464 }
 465
 466 static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
 467                           TX_SIZE tx_size, void *arg) {
 468   struct rdcost_block_args *args = arg;
 469   MACROBLOCK *const x = args->x;
 470   MACROBLOCKD *const xd = &x->e_mbd;
 471   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
 472   int64_t rd1, rd2, rd;
 473
 474   if (args->skip)
 475     return;
 476
 477   if (!is_inter_block(mbmi)) {
 478     struct encode_b_args arg = {x, NULL, &mbmi->skip};
 479     vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
 480 #if CONFIG_VP9_HIGHBITDEPTH
 481     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 482       dist_block(plane, block, tx_size, args, xd->bd);
 483     } else {
 484       dist_block(plane, block, tx_size, args, 8);
 485     }
 486 #else
 487     dist_block(plane, block, tx_size, args);
 488 #endif  // CONFIG_VP9_HIGHBITDEPTH
 489   } else if (max_txsize_lookup[plane_bsize] == tx_size) {
 490     if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
 491       // full forward transform and quantization
 492       vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 493 #if CONFIG_VP9_HIGHBITDEPTH
 494       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 495         dist_block(plane, block, tx_size, args, xd->bd);
 496       } else {
 497         dist_block(plane, block, tx_size, args, 8);
 498       }
 499 #else
 500       dist_block(plane, block, tx_size, args);
 501 #endif  // CONFIG_VP9_HIGHBITDEPTH
 502     } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
 503       // compute DC coefficient
 504       tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
 505       tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
 506       vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
 507       args->sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
 508       args->dist = args->sse;
 509       if (x->plane[plane].eobs[block]) {
 510         int64_t dc_correct = coeff[0] * coeff[0] -
 511             (coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0]);
 512 #if CONFIG_VP9_HIGHBITDEPTH
 513         dc_correct >>= ((xd->bd - 8) * 2);
 514 #endif
 515         if (tx_size != TX_32X32)
 516           dc_correct >>= 2;
 517
 518         args->dist = MAX(0, args->sse - dc_correct);
 519       }
 520     } else {
 521       // skip forward transform
 522       x->plane[plane].eobs[block] = 0;
 523       args->sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
 524       args->dist = args->sse;
 525     }
 526   } else {
 527     // full forward transform and quantization
 528     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 529 #if CONFIG_VP9_HIGHBITDEPTH
 530     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 531       dist_block(plane, block, tx_size, args, xd->bd);
 532     } else {
 533       dist_block(plane, block, tx_size, args, 8);
 534     }
 535 #else
 536     dist_block(plane, block, tx_size, args);
 537 #endif  // CONFIG_VP9_HIGHBITDEPTH
 538   }
 539
 540   rate_block(plane, block, plane_bsize, tx_size, args);
 541   rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
 542   rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
 543
 544   // TODO(jingning): temporarily enabled only for luma component
 545   rd = MIN(rd1, rd2);
 546   if (plane == 0)
 547     x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
 548                                     (rd1 > rd2 && !xd->lossless);
 549
 550   args->this_rate += args->rate;
 551   args->this_dist += args->dist;
 552   args->this_sse  += args->sse;
 553   args->this_rd += rd;
 554
 555   if (args->this_rd > args->best_rd) {
 556     args->skip = 1;
 557     return;
 558   }
 559 }
 560
 561 static void txfm_rd_in_plane(MACROBLOCK *x,
 562                              int *rate, int64_t *distortion,
 563                              int *skippable, int64_t *sse,
 564                              int64_t ref_best_rd, int plane,
 565                              BLOCK_SIZE bsize, TX_SIZE tx_size,
 566                              int use_fast_coef_casting) {
 567   MACROBLOCKD *const xd = &x->e_mbd;
 568   const struct macroblockd_plane *const pd = &xd->plane[plane];
 569   struct rdcost_block_args args;
 570   vp9_zero(args);
 571   args.x = x;
 572   args.best_rd = ref_best_rd;
 573   args.use_fast_coef_costing = use_fast_coef_casting;
 574
 575   if (plane == 0)
 576     xd->mi[0].src_mi->mbmi.tx_size = tx_size;
 577
 578   vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 579
 580   args.so = get_scan(xd, tx_size, pd->plane_type, 0);
 581
 582   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
 583                                          block_rd_txfm, &args);
 584   if (args.skip) {
 585     *rate       = INT_MAX;
 586     *distortion = INT64_MAX;
 587     *sse        = INT64_MAX;
 588     *skippable  = 0;
 589   } else {
 590     *distortion = args.this_dist;
 591     *rate       = args.this_rate;
 592     *sse        = args.this_sse;
 593     *skippable  = vp9_is_skippable_in_plane(x, bsize, plane);
 594   }
 595 }
 596
 597 static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
 598                                    int *rate, int64_t *distortion,
 599                                    int *skip, int64_t *sse,
 600                                    int64_t ref_best_rd,
 601                                    BLOCK_SIZE bs) {
 602   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 603   VP9_COMMON *const cm = &cpi->common;
 604   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
 605   MACROBLOCKD *const xd = &x->e_mbd;
 606   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
 607
 608   mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
 609
 610   txfm_rd_in_plane(x, rate, distortion, skip,
 611                    sse, ref_best_rd, 0, bs,
 612                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
 613 }
 614
 615 static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 616                                    int *rate,
 617                                    int64_t *distortion,
 618                                    int *skip,
 619                                    int64_t *psse,
 620                                    int64_t tx_cache[TX_MODES],
 621                                    int64_t ref_best_rd,
 622                                    BLOCK_SIZE bs) {
 623   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 624   VP9_COMMON *const cm = &cpi->common;
 625   MACROBLOCKD *const xd = &x->e_mbd;
 626   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
 627   vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
 628   int r[TX_SIZES][2], s[TX_SIZES];
 629   int64_t d[TX_SIZES], sse[TX_SIZES];
 630   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
 631                              {INT64_MAX, INT64_MAX},
 632                              {INT64_MAX, INT64_MAX},
 633                              {INT64_MAX, INT64_MAX}};
 634   int n, m;
 635   int s0, s1;
 636   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
 637   int64_t best_rd = INT64_MAX;
 638   TX_SIZE best_tx = max_tx_size;
 639
 640   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
 641   assert(skip_prob > 0);
 642   s0 = vp9_cost_bit(skip_prob, 0);
 643   s1 = vp9_cost_bit(skip_prob, 1);
 644
 645   for (n = max_tx_size; n >= 0;  n--) {
 646     txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
 647                      &sse[n], ref_best_rd, 0, bs, n,
 648                      cpi->sf.use_fast_coef_costing);
 649     r[n][1] = r[n][0];
 650     if (r[n][0] < INT_MAX) {
 651       for (m = 0; m <= n - (n == (int) max_tx_size); m++) {
 652         if (m == n)
 653           r[n][1] += vp9_cost_zero(tx_probs[m]);
 654         else
 655           r[n][1] += vp9_cost_one(tx_probs[m]);
 656       }
 657     }
 658     if (d[n] == INT64_MAX) {
 659       rd[n][0] = rd[n][1] = INT64_MAX;
 660     } else if (s[n]) {
 661       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
 662     } else {
 663       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
 664       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
 665     }
 666
 667     // Early termination in transform size search.
 668     if (cpi->sf.tx_size_search_breakout &&
 669         (rd[n][1] == INT64_MAX ||
 670         (n < (int) max_tx_size && rd[n][1] > rd[n + 1][1]) ||
 671         s[n] == 1))
 672       break;
 673
 674     if (rd[n][1] < best_rd) {
 675       best_tx = n;
 676       best_rd = rd[n][1];
 677     }
 678   }
 679   mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
 680                       best_tx : MIN(max_tx_size, max_mode_tx_size);
 681
 682
 683   *distortion = d[mbmi->tx_size];
 684   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
 685   *skip       = s[mbmi->tx_size];
 686   *psse       = sse[mbmi->tx_size];
 687
 688   tx_cache[ONLY_4X4] = rd[TX_4X4][0];
 689   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
 690   tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
 691   tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
 692
 693   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
 694     tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
 695   } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
 696     tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
 697   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
 698     tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
 699   } else {
 700     tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
 701   }
 702 }
 703
 704 static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 705                             int64_t *distortion, int *skip,
 706                             int64_t *psse, BLOCK_SIZE bs,
 707                             int64_t txfm_cache[TX_MODES],
 708                             int64_t ref_best_rd) {
 709   MACROBLOCKD *xd = &x->e_mbd;
 710   int64_t sse;
 711   int64_t *ret_sse = psse ? psse : &sse;
 712
 713   assert(bs == xd->mi[0].src_mi->mbmi.sb_type);
 714
 715   if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
 716     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
 717     choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
 718                            bs);
 719   } else {
 720     choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse,
 721                            txfm_cache, ref_best_rd, bs);
 722   }
 723 }
 724
 725 static int conditional_skipintra(PREDICTION_MODE mode,
 726                                  PREDICTION_MODE best_intra_mode) {
 727   if (mode == D117_PRED &&
 728       best_intra_mode != V_PRED &&
 729       best_intra_mode != D135_PRED)
 730     return 1;
 731   if (mode == D63_PRED &&
 732       best_intra_mode != V_PRED &&
 733       best_intra_mode != D45_PRED)
 734     return 1;
 735   if (mode == D207_PRED &&
 736       best_intra_mode != H_PRED &&
 737       best_intra_mode != D45_PRED)
 738     return 1;
 739   if (mode == D153_PRED &&
 740       best_intra_mode != H_PRED &&
 741       best_intra_mode != D135_PRED)
 742     return 1;
 743   return 0;
 744 }
 745
 746 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 747                                      PREDICTION_MODE *best_mode,
 748                                      const int *bmode_costs,
 749                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
 750                                      int *bestrate, int *bestratey,
 751                                      int64_t *bestdistortion,
 752                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
 753   PREDICTION_MODE mode;
 754   MACROBLOCKD *const xd = &x->e_mbd;
 755   int64_t best_rd = rd_thresh;
 756
 757   struct macroblock_plane *p = &x->plane[0];
 758   struct macroblockd_plane *pd = &xd->plane[0];
 759   const int src_stride = p->src.stride;
 760   const int dst_stride = pd->dst.stride;
 761   const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
 762                                                             src_stride)];
 763   uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
 764                                                        dst_stride)];
 765   ENTROPY_CONTEXT ta[2], tempa[2];
 766   ENTROPY_CONTEXT tl[2], templ[2];
 767
 768   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
 769   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
 770   int idx, idy;
 771   uint8_t best_dst[8 * 8];
 772 #if CONFIG_VP9_HIGHBITDEPTH
 773   uint16_t best_dst16[8 * 8];
 774 #endif
 775
 776   assert(ib < 4);
 777
 778   vpx_memcpy(ta, a, sizeof(ta));
 779   vpx_memcpy(tl, l, sizeof(tl));
 780   xd->mi[0].src_mi->mbmi.tx_size = TX_4X4;
 781
 782 #if CONFIG_VP9_HIGHBITDEPTH
 783   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
 784     for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
 785       int64_t this_rd;
 786       int ratey = 0;
 787       int64_t distortion = 0;
 788       int rate = bmode_costs[mode];
 789
 790       if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
 791         continue;
 792
 793       // Only do the oblique modes if the best so far is
 794       // one of the neighboring directional modes
 795       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
 796         if (conditional_skipintra(mode, *best_mode))
 797             continue;
 798       }
 799
 800       vpx_memcpy(tempa, ta, sizeof(ta));
 801       vpx_memcpy(templ, tl, sizeof(tl));
 802
 803       for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
 804         for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
 805           const int block = ib + idy * 2 + idx;
 806           const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
 807           uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
 808           int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
 809                                                               p->src_diff);
 810           tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
 811           xd->mi[0].src_mi->bmi[block].as_mode = mode;
 812           vp9_predict_intra_block(xd, block, 1,
 813                                   TX_4X4, mode,
 814                                   x->skip_encode ? src : dst,
 815                                   x->skip_encode ? src_stride : dst_stride,
 816                                   dst, dst_stride, idx, idy, 0);
 817           vp9_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
 818                                     dst, dst_stride, xd->bd);
 819           if (xd->lossless) {
 820             const scan_order *so = &vp9_default_scan_orders[TX_4X4];
 821             vp9_highbd_fwht4x4(src_diff, coeff, 8);
 822             vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
 823             ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
 824                                  so->scan, so->neighbors,
 825                                  cpi->sf.use_fast_coef_costing);
 826             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
 827               goto next_highbd;
 828             vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
 829                                    dst, dst_stride,
 830                                    p->eobs[block], xd->bd);
 831           } else {
 832             int64_t unused;
 833             const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
 834             const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
 835             vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
 836             vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
 837             ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
 838                                  so->scan, so->neighbors,
 839                                  cpi->sf.use_fast_coef_costing);
 840             distortion += vp9_highbd_block_error(
 841                 coeff, BLOCK_OFFSET(pd->dqcoeff, block),
 842                 16, &unused, xd->bd) >> 2;
 843             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
 844               goto next_highbd;
 845             vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
 846                                   dst, dst_stride, p->eobs[block], xd->bd);
 847           }
 848         }
 849       }
 850
 851       rate += ratey;
 852       this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 853
 854       if (this_rd < best_rd) {
 855         *bestrate = rate;
 856         *bestratey = ratey;
 857         *bestdistortion = distortion;
 858         best_rd = this_rd;
 859         *best_mode = mode;
 860         vpx_memcpy(a, tempa, sizeof(tempa));
 861         vpx_memcpy(l, templ, sizeof(templ));
 862         for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
 863           vpx_memcpy(best_dst16 + idy * 8,
 864                      CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
 865                      num_4x4_blocks_wide * 4 * sizeof(uint16_t));
 866         }
 867       }
 868     next_highbd:
 869       {}
 870     }
 871     if (best_rd >= rd_thresh || x->skip_encode)
 872       return best_rd;
 873
 874     for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
 875       vpx_memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
 876                  best_dst16 + idy * 8,
 877                  num_4x4_blocks_wide * 4 * sizeof(uint16_t));
 878     }
 879
 880     return best_rd;
 881   }
 882 #endif  // CONFIG_VP9_HIGHBITDEPTH
 883
 884   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
 885     int64_t this_rd;
 886     int ratey = 0;
 887     int64_t distortion = 0;
 888     int rate = bmode_costs[mode];
 889
 890     if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
 891       continue;
 892
 893     // Only do the oblique modes if the best so far is
 894     // one of the neighboring directional modes
 895     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
 896       if (conditional_skipintra(mode, *best_mode))
 897           continue;
 898     }
 899
 900     vpx_memcpy(tempa, ta, sizeof(ta));
 901     vpx_memcpy(templ, tl, sizeof(tl));
 902
 903     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
 904       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
 905         const int block = ib + idy * 2 + idx;
 906         const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
 907         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
 908         int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
 909                                                             p->src_diff);
 910         tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
 911         xd->mi[0].src_mi->bmi[block].as_mode = mode;
 912         vp9_predict_intra_block(xd, block, 1,
 913                                 TX_4X4, mode,
 914                                 x->skip_encode ? src : dst,
 915                                 x->skip_encode ? src_stride : dst_stride,
 916                                 dst, dst_stride, idx, idy, 0);
 917         vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 918
 919         if (xd->lossless) {
 920           const scan_order *so = &vp9_default_scan_orders[TX_4X4];
 921           vp9_fwht4x4(src_diff, coeff, 8);
 922           vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
 923           ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
 924                                so->scan, so->neighbors,
 925                                cpi->sf.use_fast_coef_costing);
 926           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
 927             goto next;
 928           vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
 929                           p->eobs[block]);
 930         } else {
 931           int64_t unused;
 932           const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
 933           const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
 934           vp9_fht4x4(src_diff, coeff, 8, tx_type);
 935           vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
 936           ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
 937                              so->scan, so->neighbors,
 938                              cpi->sf.use_fast_coef_costing);
 939           distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
 940                                         16, &unused) >> 2;
 941           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
 942             goto next;
 943           vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
 944                          dst, dst_stride, p->eobs[block]);
 945         }
 946       }
 947     }
 948
 949     rate += ratey;
 950     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 951
 952     if (this_rd < best_rd) {
 953       *bestrate = rate;
 954       *bestratey = ratey;
 955       *bestdistortion = distortion;
 956       best_rd = this_rd;
 957       *best_mode = mode;
 958       vpx_memcpy(a, tempa, sizeof(tempa));
 959       vpx_memcpy(l, templ, sizeof(templ));
 960       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
 961         vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
 962                    num_4x4_blocks_wide * 4);
 963     }
 964   next:
 965     {}
 966   }
 967
 968   if (best_rd >= rd_thresh || x->skip_encode)
 969     return best_rd;
 970
 971   for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
 972     vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
 973                num_4x4_blocks_wide * 4);
 974
 975   return best_rd;
 976 }
 977
 978 static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
 979                                             int *rate, int *rate_y,
 980                                             int64_t *distortion,
 981                                             int64_t best_rd) {
 982   int i, j;
 983   const MACROBLOCKD *const xd = &mb->e_mbd;
 984   MODE_INFO *const mic = xd->mi[0].src_mi;
 985   const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
 986   const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
 987   const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type;
 988   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
 989   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
 990   int idx, idy;
 991   int cost = 0;
 992   int64_t total_distortion = 0;
 993   int tot_rate_y = 0;
 994   int64_t total_rd = 0;
 995   ENTROPY_CONTEXT t_above[4], t_left[4];
 996   const int *bmode_costs = cpi->mbmode_cost;
 997
 998   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
 999   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
1000
1001   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
1002   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
1003     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
1004       PREDICTION_MODE best_mode = DC_PRED;
1005       int r = INT_MAX, ry = INT_MAX;
1006       int64_t d = INT64_MAX, this_rd = INT64_MAX;
1007       i = idy * 2 + idx;
1008       if (cpi->common.frame_type == KEY_FRAME) {
1009         const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
1010         const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);
1011
1012         bmode_costs  = cpi->y_mode_costs[A][L];
1013       }
1014
1015       this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
1016                                       t_above + idx, t_left + idy, &r, &ry, &d,
1017                                       bsize, best_rd - total_rd);
1018       if (this_rd >= best_rd - total_rd)
1019         return INT64_MAX;
1020
1021       total_rd += this_rd;
1022       cost += r;
1023       total_distortion += d;
1024       tot_rate_y += ry;
1025
1026       mic->bmi[i].as_mode = best_mode;
1027       for (j = 1; j < num_4x4_blocks_high; ++j)
1028         mic->bmi[i + j * 2].as_mode = best_mode;
1029       for (j = 1; j < num_4x4_blocks_wide; ++j)
1030         mic->bmi[i + j].as_mode = best_mode;
1031
1032       if (total_rd >= best_rd)
1033         return INT64_MAX;
1034     }
1035   }
1036
1037   *rate = cost;
1038   *rate_y = tot_rate_y;
1039   *distortion = total_distortion;
1040   mic->mbmi.mode = mic->bmi[3].as_mode;
1041
1042   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
1043 }
1044
1045 // This function is used only for intra_only frames
1046 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
1047                                       int *rate, int *rate_tokenonly,
1048                                       int64_t *distortion, int *skippable,
1049                                       BLOCK_SIZE bsize,
1050                                       int64_t tx_cache[TX_MODES],
1051                                       int64_t best_rd) {
1052   PREDICTION_MODE mode;
1053   PREDICTION_MODE mode_selected = DC_PRED;
1054   MACROBLOCKD *const xd = &x->e_mbd;
1055   MODE_INFO *const mic = xd->mi[0].src_mi;
1056   int this_rate, this_rate_tokenonly, s;
1057   int64_t this_distortion, this_rd;
1058   TX_SIZE best_tx = TX_4X4;
1059   int i;
1060   int *bmode_costs;
1061   const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
1062   const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
1063   const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
1064   const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
1065   bmode_costs = cpi->y_mode_costs[A][L];
1066
1067   if (cpi->sf.tx_size_search_method == USE_FULL_RD)
1068     for (i = 0; i < TX_MODES; i++)
1069       tx_cache[i] = INT64_MAX;
1070
1071   vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
1072   /* Y Search for intra prediction mode */
1073   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
1074     int64_t local_tx_cache[TX_MODES];
1075     mic->mbmi.mode = mode;
1076
1077     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
1078         &s, NULL, bsize, local_tx_cache, best_rd);
1079
1080     if (this_rate_tokenonly == INT_MAX)
1081       continue;
1082
1083     this_rate = this_rate_tokenonly + bmode_costs[mode];
1084     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
1085
1086     if (this_rd < best_rd) {
1087       mode_selected   = mode;
1088       best_rd         = this_rd;
1089       best_tx         = mic->mbmi.tx_size;
1090       *rate           = this_rate;
1091       *rate_tokenonly = this_rate_tokenonly;
1092       *distortion     = this_distortion;
1093       *skippable      = s;
1094     }
1095
1096     if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
1097       for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
1098         const int64_t adj_rd = this_rd + local_tx_cache[i] -
1099             local_tx_cache[cpi->common.tx_mode];
1100         if (adj_rd < tx_cache[i]) {
1101           tx_cache[i] = adj_rd;
1102         }
1103       }
1104     }
1105   }
1106
1107   mic->mbmi.mode = mode_selected;
1108   mic->mbmi.tx_size = best_tx;
1109
1110   return best_rd;
1111 }
1112
1113 // Return value 0: early termination triggered, no valid rd cost available;
1114 //              1: rd cost values are valid.
1115 static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
1116                             int *rate, int64_t *distortion, int *skippable,
1117                             int64_t *sse, BLOCK_SIZE bsize,
1118                             int64_t ref_best_rd) {
1119   MACROBLOCKD *const xd = &x->e_mbd;
1120   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
1121   const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
1122   int plane;
1123   int pnrate = 0, pnskip = 1;
1124   int64_t pndist = 0, pnsse = 0;
1125   int is_cost_valid = 1;
1126
1127   if (ref_best_rd < 0)
1128     is_cost_valid = 0;
1129
1130   if (is_inter_block(mbmi) && is_cost_valid) {
1131     int plane;
1132     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
1133       vp9_subtract_plane(x, bsize, plane);
1134   }
1135
1136   *rate = 0;
1137   *distortion = 0;
1138   *sse = 0;
1139   *skippable = 1;
1140
1141   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
1142     txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
1143                      ref_best_rd, plane, bsize, uv_tx_size,
1144                      cpi->sf.use_fast_coef_costing);
1145     if (pnrate == INT_MAX) {
1146       is_cost_valid = 0;
1147       break;
1148     }
1149     *rate += pnrate;
1150     *distortion += pndist;
1151     *sse += pnsse;
1152     *skippable &= pnskip;
1153   }
1154
1155   if (!is_cost_valid) {
1156     // reset cost value
1157     *rate = INT_MAX;
1158     *distortion = INT64_MAX;
1159     *sse = INT64_MAX;
1160     *skippable = 0;
1161   }
1162
1163   return is_cost_valid;
1164 }
1165
1166 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
1167                                        PICK_MODE_CONTEXT *ctx,
1168                                        int *rate, int *rate_tokenonly,
1169                                        int64_t *distortion, int *skippable,
1170                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
1171   MACROBLOCKD *xd = &x->e_mbd;
1172   PREDICTION_MODE mode;
1173   PREDICTION_MODE mode_selected = DC_PRED;
1174   int64_t best_rd = INT64_MAX, this_rd;
1175   int this_rate_tokenonly, this_rate, s;
1176   int64_t this_distortion, this_sse;
1177
1178   vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
1179   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
1180     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
1181       continue;
1182
1183     xd->mi[0].src_mi->mbmi.uv_mode = mode;
1184
1185     if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
1186                           &this_distortion, &s, &this_sse, bsize, best_rd))
1187       continue;
1188     this_rate = this_rate_tokenonly +
1189                 cpi->intra_uv_mode_cost[cpi->common.frame_type][mode];
1190     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
1191
1192     if (this_rd < best_rd) {
1193       mode_selected   = mode;
1194       best_rd         = this_rd;
1195       *rate           = this_rate;
1196       *rate_tokenonly = this_rate_tokenonly;
1197       *distortion     = this_distortion;
1198       *skippable      = s;
1199       if (!x->select_tx_size)
1200         swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
1201     }
1202   }
1203
1204   xd->mi[0].src_mi->mbmi.uv_mode = mode_selected;
1205   return best_rd;
1206 }
1207
1208 static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
1209                               int *rate, int *rate_tokenonly,
1210                               int64_t *distortion, int *skippable,
1211                               BLOCK_SIZE bsize) {
1212   const VP9_COMMON *cm = &cpi->common;
1213   int64_t unused;
1214
1215   x->e_mbd.mi[0].src_mi->mbmi.uv_mode = DC_PRED;
1216   vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
1217   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
1218                    skippable, &unused, bsize, INT64_MAX);
1219   *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
1220   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
1221 }
1222
1223 static void choose_intra_uv_mode(VP9_COMP *cpi, MACROBLOCK *const x,
1224                                  PICK_MODE_CONTEXT *ctx,
1225                                  BLOCK_SIZE bsize, TX_SIZE max_tx_size,
1226                                  int *rate_uv, int *rate_uv_tokenonly,
1227                                  int64_t *dist_uv, int *skip_uv,
1228                                  PREDICTION_MODE *mode_uv) {
1229   // Use an estimated rd for uv_intra based on DC_PRED if the
1230   // appropriate speed flag is set.
1231   if (cpi->sf.use_uv_intra_rd_estimate) {
1232     rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
1233                    skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
1234   // Else do a proper rd search for each possible transform size that may
1235   // be considered in the main rd loop.
1236   } else {
1237     rd_pick_intra_sbuv_mode(cpi, x, ctx,
1238                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
1239                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
1240   }
1241   *mode_uv = x->e_mbd.mi[0].src_mi->mbmi.uv_mode;
1242 }
1243
1244 static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
1245                        int mode_context) {
1246   assert(is_inter_mode(mode));
1247   return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
1248 }
1249
1250 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
1251                                 BLOCK_SIZE bsize,
1252                                 int_mv *frame_mv,
1253                                 int mi_row, int mi_col,
1254                                 int_mv single_newmv[MAX_REF_FRAMES],
1255                                 int *rate_mv);
1256
1257 static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
1258                                 PREDICTION_MODE mode, int_mv this_mv[2],
1259                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
1260                                 int_mv seg_mvs[MAX_REF_FRAMES],
1261                                 int_mv *best_ref_mv[2], const int *mvjcost,
1262                                 int *mvcost[2]) {
1263   MODE_INFO *const mic = xd->mi[0].src_mi;
1264   const MB_MODE_INFO *const mbmi = &mic->mbmi;
1265   int thismvcost = 0;
1266   int idx, idy;
1267   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
1268   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
1269   const int is_compound = has_second_ref(mbmi);
1270
1271   switch (mode) {
1272     case NEWMV:
1273       this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
1274       thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
1275                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
1276       if (is_compound) {
1277         this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
1278         thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
1279                                       mvjcost, mvcost, MV_COST_WEIGHT_SUB);
1280       }
1281       break;
1282     case NEARMV:
1283     case NEARESTMV:
1284       this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
1285       if (is_compound)
1286         this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
1287       break;
1288     case ZEROMV:
1289       this_mv[0].as_int = 0;
1290       if (is_compound)
1291         this_mv[1].as_int = 0;
1292       break;
1293     default:
1294       break;
1295   }
1296
1297   mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
1298   if (is_compound)
1299     mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
1300
1301   mic->bmi[i].as_mode = mode;
1302
1303   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
1304     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
1305       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
1306                  &mic->bmi[i], sizeof(mic->bmi[i]));
1307
1308   return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
1309             thismvcost;
1310 }
1311
1312 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
1313                                        MACROBLOCK *x,
1314                                        int64_t best_yrd,
1315                                        int i,
1316                                        int *labelyrate,
1317                                        int64_t *distortion, int64_t *sse,
1318                                        ENTROPY_CONTEXT *ta,
1319                                        ENTROPY_CONTEXT *tl,
1320                                        int mi_row, int mi_col) {
1321   int k;
1322   MACROBLOCKD *xd = &x->e_mbd;
1323   struct macroblockd_plane *const pd = &xd->plane[0];
1324   struct macroblock_plane *const p = &x->plane[0];
1325   MODE_INFO *const mi = xd->mi[0].src_mi;
1326   const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
1327   const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
1328   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
1329   int idx, idy;
1330
1331   const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
1332                                                              p->src.stride)];
1333   uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
1334                                                         pd->dst.stride)];
1335   int64_t thisdistortion = 0, thissse = 0;
1336   int thisrate = 0, ref;
1337   const scan_order *so = &vp9_default_scan_orders[TX_4X4];
1338   const int is_compound = has_second_ref(&mi->mbmi);
1339   const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
1340
1341   for (ref = 0; ref < 1 + is_compound; ++ref) {
1342     const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
1343                                                pd->pre[ref].stride)];
1344 #if CONFIG_VP9_HIGHBITDEPTH
1345   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1346     vp9_highbd_build_inter_predictor(pre, pd->pre[ref].stride,
1347                                      dst, pd->dst.stride,
1348                                      &mi->bmi[i].as_mv[ref].as_mv,
1349                                      &xd->block_refs[ref]->sf, width, height,
1350                                      ref, kernel, MV_PRECISION_Q3,
1351                                      mi_col * MI_SIZE + 4 * (i % 2),
1352                                      mi_row * MI_SIZE + 4 * (i / 2), xd->bd);
1353   } else {
1354     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
1355                               dst, pd->dst.stride,
1356                               &mi->bmi[i].as_mv[ref].as_mv,
1357                               &xd->block_refs[ref]->sf, width, height, ref,
1358                               kernel, MV_PRECISION_Q3,
1359                               mi_col * MI_SIZE + 4 * (i % 2),
1360                               mi_row * MI_SIZE + 4 * (i / 2));
1361   }
1362 #else
1363     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
1364                               dst, pd->dst.stride,
1365                               &mi->bmi[i].as_mv[ref].as_mv,
1366                               &xd->block_refs[ref]->sf, width, height, ref,
1367                               kernel, MV_PRECISION_Q3,
1368                               mi_col * MI_SIZE + 4 * (i % 2),
1369                               mi_row * MI_SIZE + 4 * (i / 2));
1370 #endif  // CONFIG_VP9_HIGHBITDEPTH
1371   }
1372
1373 #if CONFIG_VP9_HIGHBITDEPTH
1374   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1375     vp9_highbd_subtract_block(
1376         height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
1377         src, p->src.stride, dst, pd->dst.stride, xd->bd);
1378   } else {
1379     vp9_subtract_block(
1380         height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
1381         src, p->src.stride, dst, pd->dst.stride);
1382   }
1383 #else
1384   vp9_subtract_block(height, width,
1385                      raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
1386                      src, p->src.stride, dst, pd->dst.stride);
1387 #endif  // CONFIG_VP9_HIGHBITDEPTH
1388
1389   k = i;
1390   for (idy = 0; idy < height / 4; ++idy) {
1391     for (idx = 0; idx < width / 4; ++idx) {
1392       int64_t ssz, rd, rd1, rd2;
1393       tran_low_t* coeff;
1394
1395       k += (idy * 2 + idx);
1396       coeff = BLOCK_OFFSET(p->coeff, k);
1397       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
1398                     coeff, 8);
1399       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
1400 #if CONFIG_VP9_HIGHBITDEPTH
1401       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1402         thisdistortion += vp9_highbd_block_error(coeff,
1403                                                  BLOCK_OFFSET(pd->dqcoeff, k),
1404                                                  16, &ssz, xd->bd);
1405       } else {
1406         thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
1407                                           16, &ssz);
1408       }
1409 #else
1410       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
1411                                         16, &ssz);
1412 #endif  // CONFIG_VP9_HIGHBITDEPTH
1413       thissse += ssz;
1414       thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
1415                               so->scan, so->neighbors,
1416                               cpi->sf.use_fast_coef_costing);
1417       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
1418       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
1419       rd = MIN(rd1, rd2);
1420       if (rd >= best_yrd)
1421         return INT64_MAX;
1422     }
1423   }
1424
1425   *distortion = thisdistortion >> 2;
1426   *labelyrate = thisrate;
1427   *sse = thissse >> 2;
1428
1429   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
1430 }
1431
1432 typedef struct {
1433   int eobs;
1434   int brate;
1435   int byrate;
1436   int64_t bdist;
1437   int64_t bsse;
1438   int64_t brdcost;
1439   int_mv mvs[2];
1440   ENTROPY_CONTEXT ta[2];
1441   ENTROPY_CONTEXT tl[2];
1442 } SEG_RDSTAT;
1443
1444 typedef struct {
1445   int_mv *ref_mv[2];
1446   int_mv mvp;
1447
1448   int64_t segment_rd;
1449   int r;
1450   int64_t d;
1451   int64_t sse;
1452   int segment_yrate;
1453   PREDICTION_MODE modes[4];
1454   SEG_RDSTAT rdstat[4][INTER_MODES];
1455   int mvthresh;
1456 } BEST_SEG_INFO;
1457
1458 static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
1459   return (mv->row >> 3) < x->mv_row_min ||
1460          (mv->row >> 3) > x->mv_row_max ||
1461          (mv->col >> 3) < x->mv_col_min ||
1462          (mv->col >> 3) > x->mv_col_max;
1463 }
1464
1465 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
1466   MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0].src_mi->mbmi;
1467   struct macroblock_plane *const p = &x->plane[0];
1468   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
1469
1470   p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
1471   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
1472   pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
1473                                                        pd->pre[0].stride)];
1474   if (has_second_ref(mbmi))
1475     pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
1476                                                          pd->pre[1].stride)];
1477 }
1478
1479 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
1480                                   struct buf_2d orig_pre[2]) {
1481   MB_MODE_INFO *mbmi = &x->e_mbd.mi[0].src_mi->mbmi;
1482   x->plane[0].src = orig_src;
1483   x->e_mbd.plane[0].pre[0] = orig_pre[0];
1484   if (has_second_ref(mbmi))
1485     x->e_mbd.plane[0].pre[1] = orig_pre[1];
1486 }
1487
1488 static INLINE int mv_has_subpel(const MV *mv) {
1489   return (mv->row & 0x0F) || (mv->col & 0x0F);
1490 }
1491
1492 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion.
1493 // TODO(aconverse): Find out if this is still productive then clean up or remove
1494 static int check_best_zero_mv(
1495     const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
1496     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int this_mode,
1497     const MV_REFERENCE_FRAME ref_frames[2]) {
1498   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
1499       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
1500       (ref_frames[1] == NONE ||
1501        frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
1502     int rfc = mode_context[ref_frames[0]];
1503     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
1504     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
1505     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
1506
1507     if (this_mode == NEARMV) {
1508       if (c1 > c3) return 0;
1509     } else if (this_mode == NEARESTMV) {
1510       if (c2 > c3) return 0;
1511     } else {
1512       assert(this_mode == ZEROMV);
1513       if (ref_frames[1] == NONE) {
1514         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
1515             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
1516           return 0;
1517       } else {
1518         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
1519              frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
1520             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
1521              frame_mv[NEARMV][ref_frames[1]].as_int == 0))
1522           return 0;
1523       }
1524     }
1525   }
1526   return 1;
1527 }
1528
// Chooses a prediction mode and motion vector(s) for every sub-8x8 label of
// the current block and accumulates the resulting rate-distortion figures.
//
// Results are recorded in bsi_buf[filter_idx].  When filter_idx > 0 the
// entry at bsi_buf[0] (and bsi_buf[1] when filter_idx > 1) is consulted so
// that the stats of a matching vector without a sub-pel component can be
// copied instead of being recomputed.
//
// best_rd seeds bsi->segment_rd, the budget the running total is compared
// against.  Returns INT64_MAX, without writing the return* / skippable / psse
// outputs, when no mode yields a finite cost for some label or the total
// exceeds that budget; otherwise returns the accumulated segment rd cost.
1529 static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
1530                                         const TileInfo * const tile,
1531                                         int_mv *best_ref_mv,
1532                                         int_mv *second_best_ref_mv,
1533                                         int64_t best_rd, int *returntotrate,
1534                                         int *returnyrate,
1535                                         int64_t *returndistortion,
1536                                         int *skippable, int64_t *psse,
1537                                         int mvthresh,
1538                                         int_mv seg_mvs[4][MAX_REF_FRAMES],
1539                                         BEST_SEG_INFO *bsi_buf, int filter_idx,
1540                                         int mi_row, int mi_col) {
1541   int i;
1542   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
1543   MACROBLOCKD *xd = &x->e_mbd;
1544   MODE_INFO *mi = xd->mi[0].src_mi;
1545   MB_MODE_INFO *mbmi = &mi->mbmi;
1546   int mode_idx;
1547   int k, br = 0, idx, idy;
1548   int64_t bd = 0, block_sse = 0;
1549   PREDICTION_MODE this_mode;
1550   VP9_COMMON *cm = &cpi->common;
1551   struct macroblock_plane *const p = &x->plane[0];
1552   struct macroblockd_plane *const pd = &xd->plane[0];
1553   const int label_count = 4;
1554   int64_t this_segment_rd = 0;
1555   int label_mv_thresh;
1556   int segmentyrate = 0;
1557   const BLOCK_SIZE bsize = mbmi->sb_type;
1558   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1559   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1560   ENTROPY_CONTEXT t_above[2], t_left[2];
1561   int subpelmv = 1, have_ref = 0;
1562   const int has_second_rf = has_second_ref(mbmi);
1563   const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
1564
1565   vp9_zero(*bsi);
1566
1567   bsi->segment_rd = best_rd;
1568   bsi->ref_mv[0] = best_ref_mv;
1569   bsi->ref_mv[1] = second_best_ref_mv;
1570   bsi->mvp.as_int = best_ref_mv->as_int;
1571   bsi->mvthresh = mvthresh;
1572
1573   for (i = 0; i < 4; i++)
1574     bsi->modes[i] = ZEROMV;
1575
       // Work on local copies of the entropy contexts; they are advanced label
       // by label as modes are selected below.
1576   vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
1577   vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
1578
1579   // 64 makes this threshold really big effectively
1580   // making it so that we very rarely check mvs on
1581   // segments.   setting this to 1 would make mv thresh
1582   // roughly equal to what it is for macroblocks
1583   label_mv_thresh = 1 * bsi->mvthresh / label_count;
1584
1585   // Segmentation method overheads
       // Visit each label of the 2x2 grid of 4x4 units; the loop strides skip the
       // positions already covered by a wider or taller label.
1586   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
1587     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
1588       // TODO(jingning,rbultje): rewrite the rate-distortion optimization
1589       // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
1590       int_mv mode_mv[MB_MODE_COUNT][2];
1591       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
1592       PREDICTION_MODE mode_selected = ZEROMV;
           // NOTE: this best_rd shadows the function parameter for the rest of the
           // label loop body; it tracks the lowest cost among this label's modes.
1593       int64_t best_rd = INT64_MAX;
           // NOTE: shadows the function-level i; raster index of this label.
1594       const int i = idy * 2 + idx;
1595       int ref;
1596
           // Gather the ZEROMV/NEARESTMV/NEARMV candidates for each reference in use.
1597       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1598         const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
1599         frame_mv[ZEROMV][frame].as_int = 0;
1600         vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
1601                                       &frame_mv[NEARESTMV][frame],
1602                                       &frame_mv[NEARMV][frame]);
1603       }
1604
1605       // search for the best motion vector on this segment
1606       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
1607         const struct buf_2d orig_src = x->plane[0].src;
1608         struct buf_2d orig_pre[2];
1609
1610         mode_idx = INTER_OFFSET(this_mode);
             // Marked unusable up front so every skip path below leaves it that way.
1611         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
1612         if (!(inter_mode_mask & (1 << this_mode)))
1613           continue;
1614
1615         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
1616                                 this_mode, mbmi->ref_frame))
1617           continue;
1618
1619         vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
1620         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
1621                    sizeof(bsi->rdstat[i][mode_idx].ta));
1622         vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
1623                    sizeof(bsi->rdstat[i][mode_idx].tl));
1624
1625         // motion search for newmv (single predictor case only)
1626         if (!has_second_rf && this_mode == NEWMV &&
1627             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
1628           MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
1629           int step_param = 0;
1630           int thissme, bestsme = INT_MAX;
1631           int sadpb = x->sadperbit4;
1632           MV mvp_full;
1633           int max_mv;
1634           int cost_list[5];
1635
1636           /* Is the best so far sufficiently good that we can't justify doing
1637            * a new motion search. */
1638           if (best_rd < label_mv_thresh)
1639             break;
1640
1641           if (cpi->oxcf.mode != BEST) {
1642             // use previous block's result as next block's MV predictor.
1643             if (i > 0) {
1644               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
                   // Label 2 is seeded from label 0 rather than from label 1.
1645               if (i == 2)
1646                 bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
1647             }
1648           }
1649           if (i == 0)
1650             max_mv = x->max_mv_context[mbmi->ref_frame[0]];
1651           else
1652             max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
1653
1654           if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
1655             // Take wtd average of the step_params based on the last frame's
1656             // max mv magnitude and the best ref mvs of the current block for
1657             // the given reference.
1658             step_param = (vp9_init_search_range(max_mv) +
1659                               cpi->mv_step_param) / 2;
1660           } else {
1661             step_param = cpi->mv_step_param;
1662           }
1663
               // The >> 3 drops the fractional bits to get a full-pel start point.
1664           mvp_full.row = bsi->mvp.as_mv.row >> 3;
1665           mvp_full.col = bsi->mvp.as_mv.col >> 3;
1666
1667           if (cpi->sf.adaptive_motion_search) {
1668             mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
1669             mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
1670             step_param = MAX(step_param, 8);
1671           }
1672
1673           // adjust src pointer for this block
1674           mi_buf_shift(x, i);
1675
1676           vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
1677
1678           bestsme = vp9_full_pixel_search(
1679               cpi, x, bsize, &mvp_full, step_param, sadpb,
1680               cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
1681               &bsi->ref_mv[0]->as_mv, new_mv,
1682               INT_MAX, 1);
1683
1684           // Should we do a full search (best quality only)
1685           if (cpi->oxcf.mode == BEST) {
1686             int_mv *const best_mv = &mi->bmi[i].as_mv[0];
1687             /* Check if mvp_full is within the range. */
1688             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
1689                      x->mv_row_min, x->mv_row_max);
1690             thissme = cpi->full_search_sad(x, &mvp_full,
1691                                            sadpb, 16, &cpi->fn_ptr[bsize],
1692                                            &bsi->ref_mv[0]->as_mv,
1693                                            &best_mv->as_mv);
1694             cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
1695             if (thissme < bestsme) {
1696               bestsme = thissme;
1697               *new_mv = best_mv->as_mv;
1698             } else {
1699               // The full search result is actually worse so re-instate the
1700               // previous best vector
1701               best_mv->as_mv = *new_mv;
1702             }
1703           }
1704
1705           if (bestsme < INT_MAX) {
1706             int distortion;
1707             cpi->find_fractional_mv_step(
1708                 x,
1709                 new_mv,
1710                 &bsi->ref_mv[0]->as_mv,
1711                 cm->allow_high_precision_mv,
1712                 x->errorperbit, &cpi->fn_ptr[bsize],
1713                 cpi->sf.mv.subpel_force_stop,
1714                 cpi->sf.mv.subpel_iters_per_step,
1715                 cond_cost_list(cpi, cost_list),
1716                 x->nmvjointcost, x->mvcost,
1717                 &distortion,
1718                 &x->pred_sse[mbmi->ref_frame[0]],
1719                 NULL, 0, 0);
1720
1721             // save motion search result for use in compound prediction
1722             seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
1723           }
1724
1725           if (cpi->sf.adaptive_motion_search)
1726             x->pred_mv[mbmi->ref_frame[0]] = *new_mv;
1727
1728           // restore src pointers
1729           mi_buf_restore(x, orig_src, orig_pre);
1730         }
1731
             // Compound prediction needs a stored vector for both references.
1732         if (has_second_rf) {
1733           if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
1734               seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
1735             continue;
1736         }
1737
1738         if (has_second_rf && this_mode == NEWMV &&
1739             mbmi->interp_filter == EIGHTTAP) {
1740           // adjust src pointers
1741           mi_buf_shift(x, i);
1742           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
                 // rate_mv is filled in by the call but not used afterwards.
1743             int rate_mv;
1744             joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
1745                                 mi_row, mi_col, seg_mvs[i],
1746                                 &rate_mv);
1747             seg_mvs[i][mbmi->ref_frame[0]].as_int =
1748                 frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
1749             seg_mvs[i][mbmi->ref_frame[1]].as_int =
1750                 frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
1751           }
1752           // restore src pointers
1753           mi_buf_restore(x, orig_src, orig_pre);
1754         }
1755
1756         bsi->rdstat[i][mode_idx].brate =
1757             set_and_cost_bmi_mvs(cpi, xd, i, this_mode, mode_mv[this_mode],
1758                                  frame_mv, seg_mvs[i], bsi->ref_mv,
1759                                  x->nmvjointcost, x->mvcost);
1760
             // Record the vectors for this label and for the neighbouring 4x4
             // position(s) it also covers when it is wider or taller than 4x4.
1761         for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1762           bsi->rdstat[i][mode_idx].mvs[ref].as_int =
1763               mode_mv[this_mode][ref].as_int;
1764           if (num_4x4_blocks_wide > 1)
1765             bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
1766                 mode_mv[this_mode][ref].as_int;
1767           if (num_4x4_blocks_high > 1)
1768             bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
1769                 mode_mv[this_mode][ref].as_int;
1770         }
1771
1772         // Trap vectors that reach beyond the UMV borders
1773         if (mv_check_bounds(x, &mode_mv[this_mode][0].as_mv) ||
1774             (has_second_rf &&
1775              mv_check_bounds(x, &mode_mv[this_mode][1].as_mv)))
1776           continue;
1777
             // On later filter passes, try to reuse stats computed on an earlier
             // pass: the vector must have no sub-pel component (per mv_has_subpel)
             // and must equal the vector stored in the earlier BEST_SEG_INFO.
1778         if (filter_idx > 0) {
1779           BEST_SEG_INFO *ref_bsi = bsi_buf;
1780           subpelmv = 0;
1781           have_ref = 1;
1782
1783           for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1784             subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
1785             have_ref &= mode_mv[this_mode][ref].as_int ==
1786                 ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
1787           }
1788
               // No match in bsi_buf[0]; fall back to bsi_buf[1] if it exists.
1789           if (filter_idx > 1 && !subpelmv && !have_ref) {
1790             ref_bsi = bsi_buf + 1;
1791             have_ref = 1;
1792             for (ref = 0; ref < 1 + has_second_rf; ++ref)
1793               have_ref &= mode_mv[this_mode][ref].as_int ==
1794                   ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
1795           }
1796
1797           if (!subpelmv && have_ref &&
1798               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
1799             vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
1800                        sizeof(SEG_RDSTAT));
1801             if (num_4x4_blocks_wide > 1)
1802               bsi->rdstat[i + 1][mode_idx].eobs =
1803                   ref_bsi->rdstat[i + 1][mode_idx].eobs;
1804             if (num_4x4_blocks_high > 1)
1805               bsi->rdstat[i + 2][mode_idx].eobs =
1806                   ref_bsi->rdstat[i + 2][mode_idx].eobs;
1807
1808             if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
1809               mode_selected = this_mode;
1810               best_rd = bsi->rdstat[i][mode_idx].brdcost;
1811             }
1812             continue;
1813           }
1814         }
1815
             // The third argument is the rd budget still available for this label.
1816         bsi->rdstat[i][mode_idx].brdcost =
1817             encode_inter_mb_segment(cpi, x,
1818                                     bsi->segment_rd - this_segment_rd, i,
1819                                     &bsi->rdstat[i][mode_idx].byrate,
1820                                     &bsi->rdstat[i][mode_idx].bdist,
1821                                     &bsi->rdstat[i][mode_idx].bsse,
1822                                     bsi->rdstat[i][mode_idx].ta,
1823                                     bsi->rdstat[i][mode_idx].tl,
1824                                     mi_row, mi_col);
1825         if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
               // Add the cost of the mode/vector rate to the rd cost, then fold the
               // luma rate into brate.
1826           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
1827                                             bsi->rdstat[i][mode_idx].brate, 0);
1828           bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
1829           bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
1830           if (num_4x4_blocks_wide > 1)
1831             bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
1832           if (num_4x4_blocks_high > 1)
1833             bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
1834         }
1835
1836         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
1837           mode_selected = this_mode;
1838           best_rd = bsi->rdstat[i][mode_idx].brdcost;
1839         }
1840       } /*for each 4x4 mode*/
1841
           // No mode produced a finite cost for this label: invalidate the
           // remaining labels and give up on this partitioning.
1842       if (best_rd == INT64_MAX) {
1843         int iy, midx;
1844         for (iy = i + 1; iy < 4; ++iy)
1845           for (midx = 0; midx < INTER_MODES; ++midx)
1846             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
1847         bsi->segment_rd = INT64_MAX;
1848         return INT64_MAX;;
1849       }
1850
           // Carry the winning mode's entropy contexts forward to the next label.
1851       mode_idx = INTER_OFFSET(mode_selected);
1852       vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
1853       vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
1854
           // Called again for the selected mode with the returned cost discarded;
           // presumably this leaves the winner's mode and vectors in mi->bmi --
           // TODO confirm against set_and_cost_bmi_mvs.
1855       set_and_cost_bmi_mvs(cpi, xd, i, mode_selected, mode_mv[mode_selected],
1856                            frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
1857                            x->mvcost);
1858
1859       br += bsi->rdstat[i][mode_idx].brate;
1860       bd += bsi->rdstat[i][mode_idx].bdist;
1861       block_sse += bsi->rdstat[i][mode_idx].bsse;
1862       segmentyrate += bsi->rdstat[i][mode_idx].byrate;
1863       this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
1864
           // Early termination: the running total already exceeds the budget.
1865       if (this_segment_rd > bsi->segment_rd) {
1866         int iy, midx;
1867         for (iy = i + 1; iy < 4; ++iy)
1868           for (midx = 0; midx < INTER_MODES; ++midx)
1869             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
1870         bsi->segment_rd = INT64_MAX;
1871         return INT64_MAX;;
1872       }
1873     }
1874   } /* for each label */
1875
1876   bsi->r = br;
1877   bsi->d = bd;
1878   bsi->segment_yrate = segmentyrate;
1879   bsi->segment_rd = this_segment_rd;
1880   bsi->sse = block_sse;
1881
1882   // update the coding decisions
1883   for (k = 0; k < 4; ++k)
1884     bsi->modes[k] = mi->bmi[k].as_mode;
1885
       // best_rd here is the function parameter again; the per-label variable of
       // the same name is out of scope.
1886   if (bsi->segment_rd > best_rd)
1887     return INT64_MAX;
1888   /* set it to the best */
1889   for (i = 0; i < 4; i++) {
1890     mode_idx = INTER_OFFSET(bsi->modes[i]);
1891     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
1892     if (has_second_ref(mbmi))
1893       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
1894     x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
1895     mi->bmi[i].as_mode = bsi->modes[i];
1896   }
1897
1898   /*
1899    * used to set mbmi->mv.as_int
1900    */
1901   *returntotrate = bsi->r;
1902   *returndistortion = bsi->d;
1903   *returnyrate = bsi->segment_yrate;
1904   *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
1905   *psse = bsi->sse;
       // The block-level mode mirrors the mode of the last label.
1906   mbmi->mode = bsi->modes[3];
1907
1908   return bsi->segment_rd;
1909 }
1910
1911 static void estimate_ref_frame_costs(const VP9_COMMON *cm,
1912                                      const MACROBLOCKD *xd,
1913                                      int segment_id,
1914                                      unsigned int *ref_costs_single,
1915                                      unsigned int *ref_costs_comp,
1916                                      vp9_prob *comp_mode_p) {
1917   int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
1918                                              SEG_LVL_REF_FRAME);
1919   if (seg_ref_active) {
1920     vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
1921     vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
1922     *comp_mode_p = 128;
1923   } else {
1924     vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
1925     vp9_prob comp_inter_p = 128;
1926
1927     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
1928       comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
1929       *comp_mode_p = comp_inter_p;
1930     } else {
1931       *comp_mode_p = 128;
1932     }
1933
1934     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
1935
1936     if (cm->reference_mode != COMPOUND_REFERENCE) {
1937       vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
1938       vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
1939       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
1940
1941       if (cm->reference_mode == REFERENCE_MODE_SELECT)
1942         base_cost += vp9_cost_bit(comp_inter_p, 0);
1943
1944       ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
1945           ref_costs_single[ALTREF_FRAME] = base_cost;
1946       ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
1947       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
1948       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
1949       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
1950       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
1951     } else {
1952       ref_costs_single[LAST_FRAME]   = 512;
1953       ref_costs_single[GOLDEN_FRAME] = 512;
1954       ref_costs_single[ALTREF_FRAME] = 512;
1955     }
1956     if (cm->reference_mode != SINGLE_REFERENCE) {
1957       vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
1958       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
1959
1960       if (cm->reference_mode == REFERENCE_MODE_SELECT)
1961         base_cost += vp9_cost_bit(comp_inter_p, 1);
1962
1963       ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
1964       ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
1965     } else {
1966       ref_costs_comp[LAST_FRAME]   = 512;
1967       ref_costs_comp[GOLDEN_FRAME] = 512;
1968     }
1969   }
1970 }
1971
1972 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
1973                          int mode_index,
1974                          int64_t comp_pred_diff[REFERENCE_MODES],
1975                          const int64_t tx_size_diff[TX_MODES],
1976                          int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
1977                          int skippable) {
1978   MACROBLOCKD *const xd = &x->e_mbd;
1979
1980   // Take a snapshot of the coding context so it can be
1981   // restored if we decide to encode this way
1982   ctx->skip = x->skip;
1983   ctx->skippable = skippable;
1984   ctx->best_mode_index = mode_index;
1985   ctx->mic = *xd->mi[0].src_mi;
1986   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
1987   ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
1988   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
1989
1990   vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
1991   vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
1992              sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
1993 }
1994
1995 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
1996                                const TileInfo *const tile,
1997                                MV_REFERENCE_FRAME ref_frame,
1998                                BLOCK_SIZE block_size,
1999                                int mi_row, int mi_col,
2000                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
2001                                int_mv frame_near_mv[MAX_REF_FRAMES],
2002                                struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
2003   const VP9_COMMON *cm = &cpi->common;
2004   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
2005   MACROBLOCKD *const xd = &x->e_mbd;
2006   MODE_INFO *const mi = xd->mi[0].src_mi;
2007   int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
2008   const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
2009
2010   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
2011   // use the UV scaling factors.
2012   vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
2013
2014   // Gets an initial list of candidate vectors from neighbours and orders them
2015   vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
2016
2017   // Candidate refinement carried out at encoder and decoder
2018   vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
2019                         &frame_nearest_mv[ref_frame],
2020                         &frame_near_mv[ref_frame]);
2021
2022   // Further refinement that is encode side only to test the top few candidates
2023   // in full and choose the best as the centre point for subsequent searches.
2024   // The current implementation doesn't support scaling.
2025   if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
2026     vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
2027                 ref_frame, block_size);
2028 }
2029
2030 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2031                                  BLOCK_SIZE bsize,
2032                                  int mi_row, int mi_col,
2033                                  int_mv *tmp_mv, int *rate_mv) {
2034   MACROBLOCKD *xd = &x->e_mbd;
2035   const VP9_COMMON *cm = &cpi->common;
2036   MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
2037   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
2038   int bestsme = INT_MAX;
2039   int step_param;
2040   int sadpb = x->sadperbit16;
2041   MV mvp_full;
2042   int ref = mbmi->ref_frame[0];
2043   MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
2044
2045   int tmp_col_min = x->mv_col_min;
2046   int tmp_col_max = x->mv_col_max;
2047   int tmp_row_min = x->mv_row_min;
2048   int tmp_row_max = x->mv_row_max;
2049   int cost_list[5];
2050
2051   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
2052                                                                         ref);
2053
2054   MV pred_mv[3];
2055   pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
2056   pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
2057   pred_mv[2] = x->pred_mv[ref];
2058
2059   if (scaled_ref_frame) {
2060     int i;
2061     // Swap out the reference frame for a version that's been scaled to
2062     // match the resolution of the current frame, allowing the existing
2063     // motion search code to be used without additional modifications.
2064     for (i = 0; i < MAX_MB_PLANE; i++)
2065       backup_yv12[i] = xd->plane[i].pre[0];
2066
2067     vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
2068   }
2069
2070   vp9_set_mv_search_range(x, &ref_mv);
2071
2072   // Work out the size of the first step in the mv step search.
2073   // 0 here is maximum length first step. 1 is MAX >> 1 etc.
2074   if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
2075     // Take wtd average of the step_params based on the last frame's
2076     // max mv magnitude and that based on the best ref mvs of the current
2077     // block for the given reference.
2078     step_param = (vp9_init_search_range(x->max_mv_context[ref]) +
2079                     cpi->mv_step_param) / 2;
2080   } else {
2081     step_param = cpi->mv_step_param;
2082   }
2083
2084   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
2085     int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] -
2086           MIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
2087     step_param = MAX(step_param, boffset);
2088   }
2089
2090   if (cpi->sf.adaptive_motion_search) {
2091     int bwl = b_width_log2_lookup[bsize];
2092     int bhl = b_height_log2_lookup[bsize];
2093     int i;
2094     int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
2095
2096     if (tlevel < 5)
2097       step_param += 2;
2098
2099     for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
2100       if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
2101         x->pred_mv[ref].row = 0;
2102         x->pred_mv[ref].col = 0;
2103         tmp_mv->as_int = INVALID_MV;
2104
2105         if (scaled_ref_frame) {
2106           int i;
2107           for (i = 0; i < MAX_MB_PLANE; i++)
2108             xd->plane[i].pre[0] = backup_yv12[i];
2109         }
2110         return;
2111       }
2112     }
2113   }
2114
2115   mvp_full = pred_mv[x->mv_best_ref_index[ref]];
2116
2117   mvp_full.col >>= 3;
2118   mvp_full.row >>= 3;
2119
2120   bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
2121                                   cond_cost_list(cpi, cost_list),
2122                                   &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
2123
2124   x->mv_col_min = tmp_col_min;
2125   x->mv_col_max = tmp_col_max;
2126   x->mv_row_min = tmp_row_min;
2127   x->mv_row_max = tmp_row_max;
2128
2129   if (bestsme < INT_MAX) {
2130     int dis;  /* TODO: use dis in distortion calculation later. */
2131     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
2132                                  cm->allow_high_precision_mv,
2133                                  x->errorperbit,
2134                                  &cpi->fn_ptr[bsize],
2135                                  cpi->sf.mv.subpel_force_stop,
2136                                  cpi->sf.mv.subpel_iters_per_step,
2137                                  cond_cost_list(cpi, cost_list),
2138                                  x->nmvjointcost, x->mvcost,
2139                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
2140   }
2141   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
2142                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2143
2144   if (cpi->sf.adaptive_motion_search)
2145     x->pred_mv[ref] = tmp_mv->as_mv;
2146
2147   if (scaled_ref_frame) {
2148     int i;
2149     for (i = 0; i < MAX_MB_PLANE; i++)
2150       xd->plane[i].pre[0] = backup_yv12[i];
2151   }
2152 }
2153
2154 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2155                                 BLOCK_SIZE bsize,
2156                                 int_mv *frame_mv,
2157                                 int mi_row, int mi_col,
2158                                 int_mv single_newmv[MAX_REF_FRAMES],
2159                                 int *rate_mv) {
2160   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
2161   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
2162   MACROBLOCKD *xd = &x->e_mbd;
2163   MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
2164   const int refs[2] = { mbmi->ref_frame[0],
2165                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
2166   int_mv ref_mv[2];
2167   int ite, ref;
2168   // Prediction buffer from second frame.
2169 #if CONFIG_VP9_HIGHBITDEPTH
2170   uint8_t *second_pred;
2171   uint8_t *second_pred_alloc;
2172 #else
2173   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
2174 #endif  // CONFIG_VP9_HIGHBITDEPTH
2175   const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
2176
2177   // Do joint motion search in compound mode to get more accurate mv.
2178   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
2179   struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
2180   int last_besterr[2] = {INT_MAX, INT_MAX};
2181   const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
2182     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
2183     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
2184   };
2185 #if CONFIG_VP9_HIGHBITDEPTH
2186   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
2187     second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint16_t));
2188     second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc);
2189   } else {
2190     second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint8_t));
2191     second_pred = second_pred_alloc;
2192   }
2193 #endif  // CONFIG_VP9_HIGHBITDEPTH
2194
2195   for (ref = 0; ref < 2; ++ref) {
2196     ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
2197
2198     if (scaled_ref_frame[ref]) {
2199       int i;
2200       // Swap out the reference frame for a version that's been scaled to
2201       // match the resolution of the current frame, allowing the existing
2202       // motion search code to be used without additional modifications.
2203       for (i = 0; i < MAX_MB_PLANE; i++)
2204         backup_yv12[ref][i] = xd->plane[i].pre[ref];
2205       vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
2206                            NULL);
2207     }
2208
2209     frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
2210   }
2211
2212   // Allow joint search multiple times iteratively for each ref frame
2213   // and break out the search loop if it couldn't find better mv.
2214   for (ite = 0; ite < 4; ite++) {
2215     struct buf_2d ref_yv12[2];
2216     int bestsme = INT_MAX;
2217     int sadpb = x->sadperbit16;
2218     MV tmp_mv;
2219     int search_range = 3;
2220
2221     int tmp_col_min = x->mv_col_min;
2222     int tmp_col_max = x->mv_col_max;
2223     int tmp_row_min = x->mv_row_min;
2224     int tmp_row_max = x->mv_row_max;
2225     int id = ite % 2;
2226
2227     // Initialized here because of compiler problem in Visual Studio.
2228     ref_yv12[0] = xd->plane[0].pre[0];
2229     ref_yv12[1] = xd->plane[0].pre[1];
2230
2231     // Get pred block from second frame.
2232 #if CONFIG_VP9_HIGHBITDEPTH
2233     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
2234       vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
2235                                        ref_yv12[!id].stride,
2236                                        second_pred, pw,
2237                                        &frame_mv[refs[!id]].as_mv,
2238                                        &xd->block_refs[!id]->sf,
2239                                        pw, ph, 0,
2240                                        kernel, MV_PRECISION_Q3,
2241                                        mi_col * MI_SIZE, mi_row * MI_SIZE,
2242                                        xd->bd);
2243     } else {
2244       vp9_build_inter_predictor(ref_yv12[!id].buf,
2245                                 ref_yv12[!id].stride,
2246                                 second_pred, pw,
2247                                 &frame_mv[refs[!id]].as_mv,
2248                                 &xd->block_refs[!id]->sf,
2249                                 pw, ph, 0,
2250                                 kernel, MV_PRECISION_Q3,
2251                                 mi_col * MI_SIZE, mi_row * MI_SIZE);
2252     }
2253 #else
2254     vp9_build_inter_predictor(ref_yv12[!id].buf,
2255                               ref_yv12[!id].stride,
2256                               second_pred, pw,
2257                               &frame_mv[refs[!id]].as_mv,
2258                               &xd->block_refs[!id]->sf,
2259                               pw, ph, 0,
2260                               kernel, MV_PRECISION_Q3,
2261                               mi_col * MI_SIZE, mi_row * MI_SIZE);
2262 #endif  // CONFIG_VP9_HIGHBITDEPTH
2263
2264     // Compound motion search on first ref frame.
2265     if (id)
2266       xd->plane[0].pre[0] = ref_yv12[id];
2267     vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
2268
2269     // Use mv result from single mode as mvp.
2270     tmp_mv = frame_mv[refs[id]].as_mv;
2271
2272     tmp_mv.col >>= 3;
2273     tmp_mv.row >>= 3;
2274
2275     // Small-range full-pixel motion search
2276     bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
2277                                        search_range,
2278                                        &cpi->fn_ptr[bsize],
2279                                        &ref_mv[id].as_mv, second_pred);
2280     if (bestsme < INT_MAX)
2281       bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
2282                                       second_pred, &cpi->fn_ptr[bsize], 1);
2283
2284     x->mv_col_min = tmp_col_min;
2285     x->mv_col_max = tmp_col_max;
2286     x->mv_row_min = tmp_row_min;
2287     x->mv_row_max = tmp_row_max;
2288
2289     if (bestsme < INT_MAX) {
2290       int dis; /* TODO: use dis in distortion calculation later. */
2291       unsigned int sse;
2292       bestsme = cpi->find_fractional_mv_step(
2293           x, &tmp_mv,
2294           &ref_mv[id].as_mv,
2295           cpi->common.allow_high_precision_mv,
2296           x->errorperbit,
2297           &cpi->fn_ptr[bsize],
2298           0, cpi->sf.mv.subpel_iters_per_step,
2299           NULL,
2300           x->nmvjointcost, x->mvcost,
2301           &dis, &sse, second_pred,
2302           pw, ph);
2303     }
2304
2305     if (id)
2306       xd->plane[0].pre[0] = scaled_first_yv12;
2307
2308     if (bestsme < last_besterr[id]) {
2309       frame_mv[refs[id]].as_mv = tmp_mv;
2310       last_besterr[id] = bestsme;
2311     } else {
2312       break;
2313     }
2314   }
2315
2316   *rate_mv = 0;
2317
2318   for (ref = 0; ref < 2; ++ref) {
2319     if (scaled_ref_frame[ref]) {
2320       // restore the predictor
2321       int i;
2322       for (i = 0; i < MAX_MB_PLANE; i++)
2323         xd->plane[i].pre[ref] = backup_yv12[ref][i];
2324     }
2325
2326     *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
2327                                 &mbmi->ref_mvs[refs[ref]][0].as_mv,
2328                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2329   }
2330
2331 #if CONFIG_VP9_HIGHBITDEPTH
2332   vpx_free(second_pred_alloc);
2333 #else
2334   vpx_free(second_pred);
2335 #endif  // CONFIG_VP9_HIGHBITDEPTH
2336 }
2337
2338 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
2339                                    uint8_t *orig_dst[MAX_MB_PLANE],
2340                                    int orig_dst_stride[MAX_MB_PLANE]) {
2341   int i;
2342   for (i = 0; i < MAX_MB_PLANE; i++) {
2343     xd->plane[i].dst.buf = orig_dst[i];
2344     xd->plane[i].dst.stride = orig_dst_stride[i];
2345   }
2346 }
2347
/*
 * Evaluates the inter prediction mode currently set in the block's mode info
 * (mbmi->mode / mbmi->ref_frame[]) for the block of size bsize at
 * (mi_row, mi_col).
 *
 * Processing order:
 *   1. NEWMV only: obtain the motion vector(s). Single reference runs
 *      single_motion_search() and caches the result in single_newmv[];
 *      compound starts from the cached single_newmv[] entries and, when
 *      cpi->sf.comp_inter_joint_search_thresh <= bsize, refines them with
 *      joint_motion_search(). The mv rate is added to *rate2.
 *   2. The mode's vectors are copied into mbmi->mv[] (clamped first for
 *      non-NEWMV modes); a vector failing mv_check_bounds() rejects the mode.
 *   3. The mode signalling cost is added to *rate2 and the interpolation
 *      filter is chosen, either through one of the shortcuts or by modelling
 *      the rd cost of every switchable filter with model_rd_for_sb().
 *   4. Unless the model marked the whole block as skippable, the Y and UV
 *      residuals are costed with super_block_yrd() and super_block_uvrd().
 *
 * In/out parameters:
 *   txfm_cache        forwarded to super_block_yrd().
 *   rate2             accumulated rate; set to INT_MAX when the Y or UV
 *                     search fails.
 *   distortion        accumulated distortion; overwritten with the modelled
 *                     skip SSE on the skip path, INT64_MAX on Y/UV failure.
 *   skippable         written only on the non-skip path
 *                     (skippable_y && skippable_uv).
 *   rate_y, rate_uv   forwarded to super_block_yrd() / super_block_uvrd().
 *   disable_skip      set to 1 only on the modelled-skip path.
 *   mode_mv           per-mode, per-reference motion vectors; the NEWMV row is
 *                     updated by the search.
 *   single_newmv      per-reference NEWMV results from single prediction.
 *   single_filter     per-mode, per-reference filter chosen by single
 *                     prediction; written for single, read for compound.
 *   single_skippable  same layout, holding *skippable of single prediction.
 *   psse              forwarded to super_block_yrd(); the UV SSE is added.
 *   ref_best_rd       best rd found so far, used for early termination.
 *   mask_filter       raised to the largest rd seen in the filter search.
 *   filter_cache      reset to INT64_MAX, then filled by the filter search.
 *
 * Side effects visible in this function: mbmi->mv[] and mbmi->interp_filter
 * are written, x->skip_txfm / x->bsse are overwritten, x->skip is set on the
 * skip path, and xd->plane[].dst is temporarily redirected to a local buffer
 * (restored before every return that follows the redirection).
 *
 * Returns INT64_MAX when the mode is rejected and 0 otherwise; the caller is
 * expected to compute the final rd cost from the output parameters.
 */
2348 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
2349                                  BLOCK_SIZE bsize,
2350                                  int64_t txfm_cache[],
2351                                  int *rate2, int64_t *distortion,
2352                                  int *skippable,
2353                                  int *rate_y, int *rate_uv,
2354                                  int *disable_skip,
2355                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
2356                                  int mi_row, int mi_col,
2357                                  int_mv single_newmv[MAX_REF_FRAMES],
2358                                  INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
2359                                  int (*single_skippable)[MAX_REF_FRAMES],
2360                                  int64_t *psse,
2361                                  const int64_t ref_best_rd,
2362                                  int64_t *mask_filter,
2363                                  int64_t filter_cache[]) {
2364   VP9_COMMON *cm = &cpi->common;
2365   MACROBLOCKD *xd = &x->e_mbd;
2366   MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
2367   const int is_comp_pred = has_second_ref(mbmi);
2368   const int this_mode = mbmi->mode;
       // Row of mode_mv belonging to the mode under test, indexed by ref frame.
2369   int_mv *frame_mv = mode_mv[this_mode];
2370   int i;
       // A negative second reference is mapped to index 0 so that refs[1] is
       // always a valid array index.
2371   int refs[2] = { mbmi->ref_frame[0],
2372     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
2373   int_mv cur_mv[2];
       // Scratch prediction buffer: one 64x64 area per plane (see the
       // "j * 64 * 64" offsets and stride 64 used below).
2374 #if CONFIG_VP9_HIGHBITDEPTH
2375   DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64);
2376   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64);
2377   uint8_t *tmp_buf;
2378 #else
2379   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
2380 #endif  // CONFIG_VP9_HIGHBITDEPTH
       // Set once the filter search has left a usable prediction (and its
       // modelled rd in tmp_rd) behind, so it need not be rebuilt.
2381   int pred_exists = 0;
2382   int intpel_mv;
2383   int64_t rd, tmp_rd, best_rd = INT64_MAX;
       // Non-zero while the best prediction found so far lives in tmp_buf
       // rather than in the original destination buffer.
2384   int best_needs_copy = 0;
2385   uint8_t *orig_dst[MAX_MB_PLANE];
2386   int orig_dst_stride[MAX_MB_PLANE];
       // Rate of signalling the interpolation filter.
2387   int rs = 0;
       // SWITCHABLE here means "not decided yet": the full filter search below
       // only runs while best_filter still has this value.
2388   INTERP_FILTER best_filter = SWITCHABLE;
2389   uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
2390   int64_t bsse[MAX_MB_PLANE << 2] = {0};
2391
2392   int bsl = mi_width_log2_lookup[bsize];
       // When cb_pred_filter_search is on, the flag is a single bit derived
       // from the block position and get_chessboard_index() of the frame.
2393   int pred_filter_search = cpi->sf.cb_pred_filter_search ?
2394       (((mi_row + mi_col) >> bsl) +
2395        get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
2396
2397   int skip_txfm_sb = 0;
2398   int64_t skip_sse_sb = INT64_MAX;
2399   int64_t distortion_y = 0, distortion_uv = 0;
2400
2401 #if CONFIG_VP9_HIGHBITDEPTH
2402   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
2403     tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
2404   } else {
2405     tmp_buf = tmp_buf8;
2406   }
2407 #endif  // CONFIG_VP9_HIGHBITDEPTH
2408
       // Shortcut 1: take the filter of the above neighbour when the mode is
       // not NEWMV, or when the above and left neighbours agree. af stays
       // SWITCHABLE (i.e. "undecided") if the above neighbour is unavailable.
2409   if (pred_filter_search) {
2410     INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
2411     if (xd->up_available)
2412       af = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter;
2413     if (xd->left_available)
2414       lf = xd->mi[-1].src_mi->mbmi.interp_filter;
2415
2416     if ((this_mode != NEWMV) || (af == lf))
2417       best_filter = af;
2418   }
2419
2420   if (is_comp_pred) {
         // Compound prediction needs a valid vector for both references.
2421     if (frame_mv[refs[0]].as_int == INVALID_MV ||
2422         frame_mv[refs[1]].as_int == INVALID_MV)
2423       return INT64_MAX;
2424
         // Shortcut 2: if single prediction picked the same filter for both
         // references in this mode, reuse it for the compound mode.
2425     if (cpi->sf.adaptive_mode_search) {
2426       if (single_filter[this_mode][refs[0]] ==
2427           single_filter[this_mode][refs[1]])
2428         best_filter = single_filter[this_mode][refs[0]];
2429     }
2430   }
2431
2432   if (this_mode == NEWMV) {
2433     int rate_mv;
2434     if (is_comp_pred) {
2435       // Initialize mv using single prediction mode result.
2436       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
2437       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
2438
           // Joint refinement only for blocks at or above the threshold size;
           // otherwise just cost the two single-prediction vectors.
2439       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
2440         joint_motion_search(cpi, x, bsize, frame_mv,
2441                             mi_row, mi_col, single_newmv, &rate_mv);
2442       } else {
2443         rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
2444                                    &mbmi->ref_mvs[refs[0]][0].as_mv,
2445                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2446         rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
2447                                    &mbmi->ref_mvs[refs[1]][0].as_mv,
2448                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2449       }
2450       *rate2 += rate_mv;
2451     } else {
2452       int_mv tmp_mv;
2453       single_motion_search(cpi, x, bsize, mi_row, mi_col,
2454                            &tmp_mv, &rate_mv);
2455       if (tmp_mv.as_int == INVALID_MV)
2456         return INT64_MAX;
2457       *rate2 += rate_mv;
           // Publish the result in the mode's mv table, in the block's bmi
           // entry, and in the cache consulted later by compound NEWMV.
2458       frame_mv[refs[0]].as_int =
2459           xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
2460       single_newmv[refs[0]].as_int = tmp_mv.as_int;
2461     }
2462   }
2463
2464   for (i = 0; i < is_comp_pred + 1; ++i) {
2465     cur_mv[i] = frame_mv[refs[i]];
2466     // Clip "next_nearest" so that it does not extend too far out of image
2467     if (this_mode != NEWMV)
2468       clamp_mv2(&cur_mv[i].as_mv, xd);
2469
2470     if (mv_check_bounds(x, &cur_mv[i].as_mv))
2471       return INT64_MAX;
2472     mbmi->mv[i].as_int = cur_mv[i].as_int;
2473   }
2474
2475   // do first prediction into the destination buffer. Do the next
2476   // prediction into a temporary buffer. Then keep track of which one
2477   // of these currently holds the best predictor, and use the other
2478   // one for future predictions. In the end, copy from tmp_buf to
2479   // dst if necessary.
2480   for (i = 0; i < MAX_MB_PLANE; i++) {
2481     orig_dst[i] = xd->plane[i].dst.buf;
2482     orig_dst_stride[i] = xd->plane[i].dst.stride;
2483   }
2484
2485   /* We don't include the cost of the second reference here, because there
2486    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
2487    * words if you present them in that order, the second one is always known
2488    * if the first is known */
2489   *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
2490
       // Early out on rate alone: with zero distortion the mode already costs
       // more than the best so far. NEARESTMV is exempt from this test.
2491   if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
2492       mbmi->mode != NEARESTMV)
2493     return INT64_MAX;
2494
2495   pred_exists = 0;
2496   // Are all MVs integer pel for Y and UV
2497   intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
2498   if (is_comp_pred)
2499     intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
2500
2501   // Search for best switchable filter by checking the variance of
2502   // pred error irrespective of whether the filter will be used
2503   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
2504     filter_cache[i] = INT64_MAX;
2505
2506   if (cm->interp_filter != BILINEAR) {
2507     if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
           // Shortcut 3: low source variance, skip the search and use EIGHTTAP.
2508       best_filter = EIGHTTAP;
2509     } else if (best_filter == SWITCHABLE) {
           // No shortcut applied: model every switchable filter.
2510       int newbest;
           // Rate/distortion modelled for filter 0, reused for the remaining
           // filters when all vectors are integer-pel.
2511       int tmp_rate_sum = 0;
2512       int64_t tmp_dist_sum = 0;
2513
2514       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
2515         int j;
2516         int64_t rs_rd;
2517         int tmp_skip_sb = 0;
2518         int64_t tmp_skip_sse = INT64_MAX;
2519
2520         mbmi->interp_filter = i;
2521         rs = vp9_get_switchable_rate(cpi, xd);
2522         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
2523
2524         if (i > 0 && intpel_mv) {
               // Integer-pel vectors: no new prediction is built; the model
               // result of filter 0 is reused, only the filter rate differs.
               // NOTE(review): presumably valid because the filter choice
               // does not change an integer-pel prediction - confirm.
2525           rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
2526           filter_cache[i] = rd;
2527           filter_cache[SWITCHABLE_FILTERS] =
2528               MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
2529           if (cm->interp_filter == SWITCHABLE)
2530             rd += rs_rd;
2531           *mask_filter = MAX(*mask_filter, rd);
2532         } else {
2533           int rate_sum = 0;
2534           int64_t dist_sum = 0;
               // Filters whose bit is set in interp_filter_search_mask are
               // not evaluated (filter 0 is always evaluated).
               // NOTE(review): the two stores below are dead; both variables
               // go out of scope at the continue.
2535           if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
2536               (cpi->sf.interp_filter_search_mask & (1 << i))) {
2537             rate_sum = INT_MAX;
2538             dist_sum = INT64_MAX;
2539             continue;
2540           }
2541
               // Choose where this prediction is written. With a switchable
               // frame filter: the original dst for filter 0 or whenever the
               // current best sits in tmp_buf, tmp_buf otherwise. With a fixed
               // frame filter: the original dst for the frame's filter (or
               // for filter 0 with integer-pel vectors), tmp_buf otherwise.
2542           if ((cm->interp_filter == SWITCHABLE &&
2543                (!i || best_needs_copy)) ||
2544               (cm->interp_filter != SWITCHABLE &&
2545                (cm->interp_filter == mbmi->interp_filter ||
2546                 (i == 0 && intpel_mv)))) {
2547             restore_dst_buf(xd, orig_dst, orig_dst_stride);
2548           } else {
2549             for (j = 0; j < MAX_MB_PLANE; j++) {
2550               xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
2551               xd->plane[j].dst.stride = 64;
2552             }
2553           }
2554           vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
2555           model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum,
2556                           &tmp_skip_sb, &tmp_skip_sse);
2557
2558           rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
2559           filter_cache[i] = rd;
2560           filter_cache[SWITCHABLE_FILTERS] =
2561               MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
2562           if (cm->interp_filter == SWITCHABLE)
2563             rd += rs_rd;
2564           *mask_filter = MAX(*mask_filter, rd);
2565
               // Remember filter 0's model result for the integer-pel reuse
               // path above.
2566           if (i == 0 && intpel_mv) {
2567             tmp_rate_sum = rate_sum;
2568             tmp_dist_sum = dist_sum;
2569           }
2570         }
2571
             // Breakout after the first filter: give up on the mode when half
             // of its modelled rd already exceeds the best rd so far.
2572         if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
2573           if (rd / 2 > ref_best_rd) {
2574             restore_dst_buf(xd, orig_dst, orig_dst_stride);
2575             return INT64_MAX;
2576           }
2577         }
2578         newbest = i == 0 || rd < best_rd;
2579
2580         if (newbest) {
2581           best_rd = rd;
2582           best_filter = mbmi->interp_filter;
               // The new best was written to the buffer that did not hold the
               // previous best, so the "best is in tmp_buf" flag flips.
2583           if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
2584             best_needs_copy = !best_needs_copy;
2585         }
2586
             // Snapshot the model's skip state whenever the filter just
             // evaluated is the one that would be used: the new best for a
             // switchable frame filter, or the frame's fixed filter.
2587         if ((cm->interp_filter == SWITCHABLE && newbest) ||
2588             (cm->interp_filter != SWITCHABLE &&
2589              cm->interp_filter == mbmi->interp_filter)) {
2590           pred_exists = 1;
2591           tmp_rd = best_rd;
2592
2593           skip_txfm_sb = tmp_skip_sb;
2594           skip_sse_sb = tmp_skip_sse;
2595           vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
2596           vpx_memcpy(bsse, x->bsse, sizeof(bsse));
2597         }
2598       }
2599       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2600     }
2601   }
2602   // Set the appropriate filter
2603   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
2604       cm->interp_filter : best_filter;
2605   rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
2606
2607   if (pred_exists) {
2608     if (best_needs_copy) {
2609       // again temporarily set the buffers to local memory to prevent a memcpy
2610       for (i = 0; i < MAX_MB_PLANE; i++) {
2611         xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
2612         xd->plane[i].dst.stride = 64;
2613       }
2614     }
2615     rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
2616   } else {
2617     int tmp_rate;
2618     int64_t tmp_dist;
2619     // Handles the special case when a filter that is not in the
2620     // switchable list (ex. bilinear) is indicated at the frame level, or
2621     // skip condition holds.
2622     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
2623     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
2624                     &skip_txfm_sb, &skip_sse_sb);
2625     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
2626     vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
2627     vpx_memcpy(bsse, x->bsse, sizeof(bsse));
2628   }
2629
       // Record the filter chosen by single prediction for later compound use.
2630   if (!is_comp_pred)
2631     single_filter[this_mode][refs[0]] = mbmi->interp_filter;
2632
       // Compound mode whose two single-reference counterparts were both
       // skippable: mark every skip_txfm entry as 1.
2633   if (cpi->sf.adaptive_mode_search)
2634     if (is_comp_pred)
2635       if (single_skippable[this_mode][refs[0]] &&
2636           single_skippable[this_mode][refs[1]])
2637         vpx_memset(skip_txfm, 1, sizeof(skip_txfm));
2638
2639   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
2640     // if current pred_error modeled rd is substantially more than the best
2641     // so far, do not bother doing full rd
2642     if (rd / 2 > ref_best_rd) {
2643       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2644       return INT64_MAX;
2645     }
2646   }
2647
2648   if (cm->interp_filter == SWITCHABLE)
2649     *rate2 += rs;
2650
       // Hand the skip/SSE state that belongs to the selected prediction back
       // to x before the residual is costed.
2651   vpx_memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
2652   vpx_memcpy(x->bsse, bsse, sizeof(bsse));
2653
2654   if (!skip_txfm_sb) {
2655     int skippable_y, skippable_uv;
2656     int64_t sseuv = INT64_MAX;
2657     int64_t rdcosty = INT64_MAX;
2658
2659     // Y cost and distortion
2660     vp9_subtract_plane(x, bsize, 0);
2661     super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
2662                     bsize, txfm_cache, ref_best_rd);
2663
         // *rate_y == INT_MAX is treated as "Y search failed".
2664     if (*rate_y == INT_MAX) {
2665       *rate2 = INT_MAX;
2666       *distortion = INT64_MAX;
2667       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2668       return INT64_MAX;
2669     }
2670
2671     *rate2 += *rate_y;
2672     *distortion += distortion_y;
2673
         // Y cost so far is the cheaper of coding the residual and a
         // zero-rate cost on the SSE in *psse; what remains of ref_best_rd is
         // the budget passed to the UV search.
2674     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
2675     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
2676
2677     if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
2678                           &sseuv, bsize, ref_best_rd - rdcosty)) {
2679       *rate2 = INT_MAX;
2680       *distortion = INT64_MAX;
2681       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2682       return INT64_MAX;
2683     }
2684
2685     *psse += sseuv;
2686     *rate2 += *rate_uv;
2687     *distortion += distortion_uv;
2688     *skippable = skippable_y && skippable_uv;
2689   } else {
         // The model flagged the whole block as skippable: no residual is
         // costed, and *skippable is left as the caller passed it in.
2690     x->skip = 1;
2691     *disable_skip = 1;
2692
2693     // The cost of skip bit needs to be added.
2694     *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
2695
2696     *distortion = skip_sse_sb;
2697   }
2698
       // Record skippability of single prediction for later compound use.
2699   if (!is_comp_pred)
2700     single_skippable[this_mode][refs[0]] = *skippable;
2701
2702   restore_dst_buf(xd, orig_dst, orig_dst_stride);
2703   return 0;  // The rate-distortion cost will be re-calculated by caller.
2704 }
2705
2706 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
2707                                RD_COST *rd_cost, BLOCK_SIZE bsize,
2708                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
2709   VP9_COMMON *const cm = &cpi->common;
2710   MACROBLOCKD *const xd = &x->e_mbd;
2711   struct macroblockd_plane *const pd = xd->plane;
2712   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
2713   int y_skip = 0, uv_skip = 0;
2714   int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
2715   TX_SIZE max_uv_tx_size;
2716   x->skip_encode = 0;
2717   ctx->skip = 0;
2718   xd->mi[0].src_mi->mbmi.ref_frame[0] = INTRA_FRAME;
2719
2720   if (bsize >= BLOCK_8X8) {
2721     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
2722                                &dist_y, &y_skip, bsize, tx_cache,
2723                                best_rd) >= best_rd) {
2724       rd_cost->rate = INT_MAX;
2725       return;
2726     }
2727   } else {
2728     y_skip = 0;
2729     if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
2730                                      &dist_y, best_rd) >= best_rd) {
2731       rd_cost->rate = INT_MAX;
2732       return;
2733     }
2734   }
2735   max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0].src_mi->mbmi.tx_size, bsize,
2736                                        pd[1].subsampling_x,
2737                                        pd[1].subsampling_y);
2738   rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
2739                           &dist_uv, &uv_skip, MAX(BLOCK_8X8, bsize),
2740                           max_uv_tx_size);
2741
2742   if (y_skip && uv_skip) {
2743     rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
2744                     vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
2745     rd_cost->dist = dist_y + dist_uv;
2746     vp9_zero(ctx->tx_rd_diff);
2747   } else {
2748     int i;
2749     rd_cost->rate = rate_y + rate_uv +
2750                       vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
2751     rd_cost->dist = dist_y + dist_uv;
2752     if (cpi->sf.tx_size_search_method == USE_FULL_RD)
2753       for (i = 0; i < TX_MODES; i++) {
2754         if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
2755           ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
2756         else
2757           ctx->tx_rd_diff[i] = 0;
2758       }
2759   }
2760
2761   ctx->mic = *xd->mi[0].src_mi;
2762   rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
2763 }
2764
2765 void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
2766                                TileDataEnc *tile_data,
2767                                MACROBLOCK *x,
2768                                int mi_row, int mi_col,
2769                                RD_COST *rd_cost, BLOCK_SIZE bsize,
2770                                PICK_MODE_CONTEXT *ctx,
2771                                int64_t best_rd_so_far) {
2772   VP9_COMMON *const cm = &cpi->common;
2773   TileInfo *const tile_info = &tile_data->tile_info;
2774   RD_OPT *const rd_opt = &cpi->rd;
2775   SPEED_FEATURES *const sf = &cpi->sf;
2776   MACROBLOCKD *const xd = &x->e_mbd;
2777   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
2778   const struct segmentation *const seg = &cm->seg;
2779   PREDICTION_MODE this_mode;
2780   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
2781   unsigned char segment_id = mbmi->segment_id;
2782   int comp_pred, i, k;
2783   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
2784   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
2785   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
2786   INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
2787   int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
2788   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
2789                                     VP9_ALT_FLAG };
2790   int64_t best_rd = best_rd_so_far;
2791   int64_t best_tx_rd[TX_MODES];
2792   int64_t best_tx_diff[TX_MODES];
2793   int64_t best_pred_diff[REFERENCE_MODES];
2794   int64_t best_pred_rd[REFERENCE_MODES];
2795   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
2796   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
2797   MB_MODE_INFO best_mbmode;
2798   int best_mode_skippable = 0;
2799   int midx, best_mode_index = -1;
2800   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
2801   vp9_prob comp_mode_p;
2802   int64_t best_intra_rd = INT64_MAX;
2803   unsigned int best_pred_sse = UINT_MAX;
2804   PREDICTION_MODE best_intra_mode = DC_PRED;
2805   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
2806   int64_t dist_uv[TX_SIZES];
2807   int skip_uv[TX_SIZES];
2808   PREDICTION_MODE mode_uv[TX_SIZES];
2809   const int intra_cost_penalty = vp9_get_intra_cost_penalty(
2810       cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
2811   int best_skip2 = 0;
2812   uint8_t ref_frame_skip_mask[2] = { 0 };
2813   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
2814   int mode_skip_start = sf->mode_skip_start + 1;
2815   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
2816   const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
2817   int64_t mode_threshold[MAX_MODES];
2818   int *mode_map = tile_data->mode_map[bsize];
2819   const int mode_search_skip_flags = sf->mode_search_skip_flags;
2820   int64_t mask_filter = 0;
2821   int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
2822
2823   vp9_zero(best_mbmode);
2824
2825   x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
2826
2827   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
2828     filter_cache[i] = INT64_MAX;
2829
2830   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
2831                            &comp_mode_p);
2832
2833   for (i = 0; i < REFERENCE_MODES; ++i)
2834     best_pred_rd[i] = INT64_MAX;
2835   for (i = 0; i < TX_MODES; i++)
2836     best_tx_rd[i] = INT64_MAX;
2837   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
2838     best_filter_rd[i] = INT64_MAX;
2839   for (i = 0; i < TX_SIZES; i++)
2840     rate_uv_intra[i] = INT_MAX;
2841   for (i = 0; i < MAX_REF_FRAMES; ++i)
2842     x->pred_sse[i] = INT_MAX;
2843   for (i = 0; i < MB_MODE_COUNT; ++i) {
2844     for (k = 0; k < MAX_REF_FRAMES; ++k) {
2845       single_inter_filter[i][k] = SWITCHABLE;
2846       single_skippable[i][k] = 0;
2847     }
2848   }
2849
2850   rd_cost->rate = INT_MAX;
2851
2852   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
2853     x->pred_mv_sad[ref_frame] = INT_MAX;
2854     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
2855       setup_buffer_inter(cpi, x, tile_info, ref_frame, bsize, mi_row, mi_col,
2856                          frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
2857     }
2858     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
2859     frame_mv[ZEROMV][ref_frame].as_int = 0;
2860   }
2861
2862   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
2863     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
2864       // Skip checking missing references in both single and compound reference
2865       // modes. Note that a mode will be skipped iff both reference frames
2866       // are masked out.
2867       ref_frame_skip_mask[0] |= (1 << ref_frame);
2868       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
2869     } else if (sf->reference_masking) {
2870       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
2871         // Skip fixed mv modes for poor references
2872         if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
2873           mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
2874           break;
2875         }
2876       }
2877     }
2878     // If the segment reference frame feature is enabled....
2879     // then do nothing if the current ref frame is not allowed..
2880     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
2881         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
2882       ref_frame_skip_mask[0] |= (1 << ref_frame);
2883       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
2884     }
2885   }
2886
2887   // Disable this drop out case if the ref frame
2888   // segment level feature is enabled for this segment. This is to
2889   // prevent the possibility that we end up unable to pick any mode.
2890   if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
2891     // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
2892     // unless ARNR filtering is enabled in which case we want
2893     // an unfiltered alternative. We allow near/nearest as well
2894     // because they may result in zero-zero MVs but be cheaper.
2895     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
2896       ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
2897       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
2898       mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
2899       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
2900         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
2901       if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
2902         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
2903     }
2904   }
2905
2906   if (cpi->rc.is_src_frame_alt_ref) {
2907     if (sf->alt_ref_search_fp) {
2908       mode_skip_mask[ALTREF_FRAME] = 0;
2909       ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
2910       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
2911     }
2912   }
2913
2914   if (sf->alt_ref_search_fp)
2915     if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
2916       if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
2917         mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
2918
2919   if (sf->adaptive_mode_search) {
2920     if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
2921         cpi->rc.frames_since_golden >= 3)
2922       if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1))
2923         mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
2924   }
2925
2926   if (bsize > sf->max_intra_bsize) {
2927     ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
2928     ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
2929   }
2930
2931   mode_skip_mask[INTRA_FRAME] |=
2932       ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
2933
2934   for (i = 0; i < MAX_MODES; ++i)
2935     mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
2936
2937   midx =  sf->schedule_mode_search ? mode_skip_start : 0;
2938   while (midx > 4) {
2939     uint8_t end_pos = 0;
2940     for (i = 5; i < midx; ++i) {
2941       if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
2942         uint8_t tmp = mode_map[i];
2943         mode_map[i] = mode_map[i - 1];
2944         mode_map[i - 1] = tmp;
2945         end_pos = i;
2946       }
2947     }
2948     midx = end_pos;
2949   }
2950
2951   for (midx = 0; midx < MAX_MODES; ++midx) {
2952     int mode_index = mode_map[midx];
2953     int mode_excluded = 0;
2954     int64_t this_rd = INT64_MAX;
2955     int disable_skip = 0;
2956     int compmode_cost = 0;
2957     int rate2 = 0, rate_y = 0, rate_uv = 0;
2958     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
2959     int skippable = 0;
2960     int64_t tx_cache[TX_MODES];
2961     int this_skip2 = 0;
2962     int64_t total_sse = INT64_MAX;
2963     int early_term = 0;
2964
2965     this_mode = vp9_mode_order[mode_index].mode;
2966     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
2967     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
2968
2969     // Look at the reference frame of the best mode so far and set the
2970     // skip mask to look at a subset of the remaining modes.
2971     if (midx == mode_skip_start && best_mode_index >= 0) {
2972       switch (best_mbmode.ref_frame[0]) {
2973         case INTRA_FRAME:
2974           break;
2975         case LAST_FRAME:
2976           ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
2977           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
2978           break;
2979         case GOLDEN_FRAME:
2980           ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
2981           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
2982           break;
2983         case ALTREF_FRAME:
2984           ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK;
2985           break;
2986         case NONE:
2987         case MAX_REF_FRAMES:
2988           assert(0 && "Invalid Reference frame");
2989           break;
2990       }
2991     }
2992
2993     if (ref_frame_skip_mask[0] & (1 << ref_frame) &&
2994         ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame)))
2995       continue;
2996
2997     if (mode_skip_mask[ref_frame] & (1 << this_mode))
2998       continue;
2999
3000     // Test best rd so far against threshold for trying this mode.
3001     if (best_mode_skippable && sf->schedule_mode_search)
3002       mode_threshold[mode_index] <<= 1;
3003
3004     if (best_rd < mode_threshold[mode_index])
3005       continue;
3006
3007     if (sf->motion_field_mode_search) {
3008       const int mi_width  = MIN(num_8x8_blocks_wide_lookup[bsize],
3009                                 tile_info->mi_col_end - mi_col);
3010       const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
3011                                 tile_info->mi_row_end - mi_row);
3012       const int bsl = mi_width_log2_lookup[bsize];
3013       int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
3014           + get_chessboard_index(cm->current_video_frame)) & 0x1;
3015       MB_MODE_INFO *ref_mbmi;
3016       int const_motion = 1;
3017       int skip_ref_frame = !cb_partition_search_ctrl;
3018       MV_REFERENCE_FRAME rf = NONE;
3019       int_mv ref_mv;
3020       ref_mv.as_int = INVALID_MV;
3021
3022       if ((mi_row - 1) >= tile_info->mi_row_start) {
3023         ref_mv = xd->mi[-xd->mi_stride].src_mi->mbmi.mv[0];
3024         rf = xd->mi[-xd->mi_stride].src_mi->mbmi.ref_frame[0];
3025         for (i = 0; i < mi_width; ++i) {
3026           ref_mbmi = &xd->mi[-xd->mi_stride + i].src_mi->mbmi;
3027           const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
3028                           (ref_frame == ref_mbmi->ref_frame[0]);
3029           skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
3030         }
3031       }
3032
3033       if ((mi_col - 1) >= tile_info->mi_col_start) {
3034         if (ref_mv.as_int == INVALID_MV)
3035           ref_mv = xd->mi[-1].src_mi->mbmi.mv[0];
3036         if (rf == NONE)
3037           rf = xd->mi[-1].src_mi->mbmi.ref_frame[0];
3038         for (i = 0; i < mi_height; ++i) {
3039           ref_mbmi = &xd->mi[i * xd->mi_stride - 1].src_mi->mbmi;
3040           const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
3041                           (ref_frame == ref_mbmi->ref_frame[0]);
3042           skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
3043         }
3044       }
3045
3046       if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV)
3047         if (rf > INTRA_FRAME)
3048           if (ref_frame != rf)
3049             continue;
3050
3051       if (const_motion)
3052         if (this_mode == NEARMV || this_mode == ZEROMV)
3053           continue;
3054     }
3055
3056     comp_pred = second_ref_frame > INTRA_FRAME;
3057     if (comp_pred) {
3058       if (!cm->allow_comp_inter_inter)
3059         continue;
3060
3061       // Skip compound inter modes if ARF is not available.
3062       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
3063         continue;
3064
3065       // Do not allow compound prediction if the segment level reference frame
3066       // feature is in use as in this case there can only be one reference.
3067       if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
3068         continue;
3069
3070       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
3071           best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
3072         continue;
3073
3074       mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
3075     } else {
3076       if (ref_frame != INTRA_FRAME)
3077         mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
3078     }
3079
3080     if (ref_frame == INTRA_FRAME) {
3081       if (sf->adaptive_mode_search)
3082         if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
3083           continue;
3084
3085       if (this_mode != DC_PRED) {
3086         // Disable intra modes other than DC_PRED for blocks with low variance
3087         // Threshold for intra skipping based on source variance
3088         // TODO(debargha): Specialize the threshold for super block sizes
3089         const unsigned int skip_intra_var_thresh = 64;
3090         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
3091             x->source_variance < skip_intra_var_thresh)
3092           continue;
3093         // Only search the oblique modes if the best so far is
3094         // one of the neighboring directional modes
3095         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
3096             (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
3097           if (best_mode_index >= 0 &&
3098               best_mbmode.ref_frame[0] > INTRA_FRAME)
3099             continue;
3100         }
3101         if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
3102           if (conditional_skipintra(this_mode, best_intra_mode))
3103               continue;
3104         }
3105       }
3106     } else {
3107       const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
3108       if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
3109                               this_mode, ref_frames))
3110         continue;
3111     }
3112
3113     mbmi->mode = this_mode;
3114     mbmi->uv_mode = DC_PRED;
3115     mbmi->ref_frame[0] = ref_frame;
3116     mbmi->ref_frame[1] = second_ref_frame;
3117     // Evaluate all sub-pel filters irrespective of whether we can use
3118     // them for this frame.
3119     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
3120                                                           : cm->interp_filter;
3121     mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
3122
3123     x->skip = 0;
3124     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
3125
3126     // Select prediction reference frames.
3127     for (i = 0; i < MAX_MB_PLANE; i++) {
3128       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
3129       if (comp_pred)
3130         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
3131     }
3132
3133     for (i = 0; i < TX_MODES; ++i)
3134       tx_cache[i] = INT64_MAX;
3135
3136     if (ref_frame == INTRA_FRAME) {
3137       TX_SIZE uv_tx;
3138       struct macroblockd_plane *const pd = &xd->plane[1];
3139       vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
3140       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
3141                       NULL, bsize, tx_cache, best_rd);
3142       if (rate_y == INT_MAX)
3143         continue;
3144
3145       uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
3146                                   pd->subsampling_y);
3147       if (rate_uv_intra[uv_tx] == INT_MAX) {
3148         choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
3149                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
3150                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
3151       }
3152
3153       rate_uv = rate_uv_tokenonly[uv_tx];
3154       distortion_uv = dist_uv[uv_tx];
3155       skippable = skippable && skip_uv[uv_tx];
3156       mbmi->uv_mode = mode_uv[uv_tx];
3157
3158       rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
3159       if (this_mode != DC_PRED && this_mode != TM_PRED)
3160         rate2 += intra_cost_penalty;
3161       distortion2 = distortion_y + distortion_uv;
3162     } else {
3163       this_rd = handle_inter_mode(cpi, x, bsize,
3164                                   tx_cache,
3165                                   &rate2, &distortion2, &skippable,
3166                                   &rate_y, &rate_uv,
3167                                   &disable_skip, frame_mv,
3168                                   mi_row, mi_col,
3169                                   single_newmv, single_inter_filter,
3170                                   single_skippable, &total_sse, best_rd,
3171                                   &mask_filter, filter_cache);
3172       if (this_rd == INT64_MAX)
3173         continue;
3174
3175       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
3176
3177       if (cm->reference_mode == REFERENCE_MODE_SELECT)
3178         rate2 += compmode_cost;
3179     }
3180
3181     // Estimate the reference frame signaling cost and add it
3182     // to the rolling cost variable.
3183     if (comp_pred) {
3184       rate2 += ref_costs_comp[ref_frame];
3185     } else {
3186       rate2 += ref_costs_single[ref_frame];
3187     }
3188
3189     if (!disable_skip) {
3190       if (skippable) {
3191         // Back out the coefficient coding costs
3192         rate2 -= (rate_y + rate_uv);
3193
3194         // Cost the skip mb case
3195         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
3196       } else if (ref_frame != INTRA_FRAME && !xd->lossless) {
3197         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
3198             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
3199           // Add in the cost of the no skip flag.
3200           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3201         } else {
3202           // FIXME(rbultje) make this work for splitmv also
3203           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
3204           distortion2 = total_sse;
3205           assert(total_sse >= 0);
3206           rate2 -= (rate_y + rate_uv);
3207           this_skip2 = 1;
3208         }
3209       } else {
3210         // Add in the cost of the no skip flag.
3211         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3212       }
3213
3214       // Calculate the final RD estimate for this mode.
3215       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3216     }
3217
3218     if (ref_frame == INTRA_FRAME) {
3219     // Keep record of best intra rd
3220       if (this_rd < best_intra_rd) {
3221         best_intra_rd = this_rd;
3222         best_intra_mode = mbmi->mode;
3223       }
3224     }
3225
3226     if (!disable_skip && ref_frame == INTRA_FRAME) {
3227       for (i = 0; i < REFERENCE_MODES; ++i)
3228         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
3229       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3230         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
3231     }
3232
3233     // Did this mode help.. i.e. is it the new best mode
3234     if (this_rd < best_rd || x->skip) {
3235       int max_plane = MAX_MB_PLANE;
3236       if (!mode_excluded) {
3237         // Note index of best mode so far
3238         best_mode_index = mode_index;
3239
3240         if (ref_frame == INTRA_FRAME) {
3241           /* required for left and above block mv */
3242           mbmi->mv[0].as_int = 0;
3243           max_plane = 1;
3244         } else {
3245           best_pred_sse = x->pred_sse[ref_frame];
3246         }
3247
3248         rd_cost->rate = rate2;
3249         rd_cost->dist = distortion2;
3250         rd_cost->rdcost = this_rd;
3251         best_rd = this_rd;
3252         best_mbmode = *mbmi;
3253         best_skip2 = this_skip2;
3254         best_mode_skippable = skippable;
3255
3256         if (!x->select_tx_size)
3257           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
3258         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
3259                    sizeof(uint8_t) * ctx->num_4x4_blk);
3260
3261         // TODO(debargha): enhance this test with a better distortion prediction
3262         // based on qp, activity mask and history
3263         if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
3264             (mode_index > MIN_EARLY_TERM_INDEX)) {
3265           int qstep = xd->plane[0].dequant[1];
3266           // TODO(debargha): Enhance this by specializing for each mode_index
3267           int scale = 4;
3268 #if CONFIG_VP9_HIGHBITDEPTH
3269           if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
3270             qstep >>= (xd->bd - 8);
3271           }
3272 #endif  // CONFIG_VP9_HIGHBITDEPTH
3273           if (x->source_variance < UINT_MAX) {
3274             const int var_adjust = (x->source_variance < 16);
3275             scale -= var_adjust;
3276           }
3277           if (ref_frame > INTRA_FRAME &&
3278               distortion2 * scale < qstep * qstep) {
3279             early_term = 1;
3280           }
3281         }
3282       }
3283     }
3284
3285     /* keep record of best compound/single-only prediction */
3286     if (!disable_skip && ref_frame != INTRA_FRAME) {
3287       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
3288
3289       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
3290         single_rate = rate2 - compmode_cost;
3291         hybrid_rate = rate2;
3292       } else {
3293         single_rate = rate2;
3294         hybrid_rate = rate2 + compmode_cost;
3295       }
3296
3297       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
3298       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
3299
3300       if (!comp_pred) {
3301         if (single_rd < best_pred_rd[SINGLE_REFERENCE])
3302           best_pred_rd[SINGLE_REFERENCE] = single_rd;
3303       } else {
3304         if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
3305           best_pred_rd[COMPOUND_REFERENCE] = single_rd;
3306       }
3307       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
3308         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
3309
3310       /* keep record of best filter type */
3311       if (!mode_excluded && cm->interp_filter != BILINEAR) {
3312         int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
3313                               SWITCHABLE_FILTERS : cm->interp_filter];
3314
3315         for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
3316           int64_t adj_rd;
3317           if (ref == INT64_MAX)
3318             adj_rd = 0;
3319           else if (filter_cache[i] == INT64_MAX)
3320             // when early termination is triggered, the encoder does not have
3321             // access to the rate-distortion cost. it only knows that the cost
3322             // should be above the maximum valid value. hence it takes the known
3323             // maximum plus an arbitrary constant as the rate-distortion cost.
3324             adj_rd = mask_filter - ref + 10;
3325           else
3326             adj_rd = filter_cache[i] - ref;
3327
3328           adj_rd += this_rd;
3329           best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
3330         }
3331       }
3332     }
3333
3334     /* keep record of best txfm size */
3335     if (bsize < BLOCK_32X32) {
3336       if (bsize < BLOCK_16X16)
3337         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
3338
3339       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
3340     }
3341     if (!mode_excluded && this_rd != INT64_MAX) {
3342       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
3343         int64_t adj_rd = INT64_MAX;
3344         adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
3345
3346         if (adj_rd < best_tx_rd[i])
3347           best_tx_rd[i] = adj_rd;
3348       }
3349     }
3350
3351     if (early_term)
3352       break;
3353
3354     if (x->skip && !comp_pred)
3355       break;
3356   }
3357
3358   // The inter modes' rate costs are not calculated precisely in some cases.
3359   // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
3360   // ZEROMV. Here, checks are added for those cases, and the mode decisions
3361   // are corrected.
3362   if (best_mbmode.mode == NEWMV) {
3363     const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
3364         best_mbmode.ref_frame[1]};
3365     int comp_pred_mode = refs[1] > INTRA_FRAME;
3366
3367     if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
3368         ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
3369             best_mbmode.mv[1].as_int) || !comp_pred_mode))
3370       best_mbmode.mode = NEARESTMV;
3371     else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
3372         ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int ==
3373             best_mbmode.mv[1].as_int) || !comp_pred_mode))
3374       best_mbmode.mode = NEARMV;
3375     else if (best_mbmode.mv[0].as_int == 0 &&
3376         ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode))
3377       best_mbmode.mode = ZEROMV;
3378   }
3379
3380   if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
3381     rd_cost->rate = INT_MAX;
3382     rd_cost->rdcost = INT64_MAX;
3383     return;
3384   }
3385
3386   // If we used an estimate for the uv intra rd in the loop above...
3387   if (sf->use_uv_intra_rd_estimate) {
3388     // Do Intra UV best rd mode selection if best mode choice above was intra.
3389     if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
3390       TX_SIZE uv_tx_size;
3391       *mbmi = best_mbmode;
3392       uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
3393       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
3394                               &rate_uv_tokenonly[uv_tx_size],
3395                               &dist_uv[uv_tx_size],
3396                               &skip_uv[uv_tx_size],
3397                               bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
3398                               uv_tx_size);
3399     }
3400   }
3401
3402   assert((cm->interp_filter == SWITCHABLE) ||
3403          (cm->interp_filter == best_mbmode.interp_filter) ||
3404          !is_inter_block(&best_mbmode));
3405
3406   if (!cpi->rc.is_src_frame_alt_ref)
3407     vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
3408                               sf->adaptive_rd_thresh, bsize, best_mode_index);
3409
3410   // macroblock modes
3411   *mbmi = best_mbmode;
3412   x->skip |= best_skip2;
3413
3414   for (i = 0; i < REFERENCE_MODES; ++i) {
3415     if (best_pred_rd[i] == INT64_MAX)
3416       best_pred_diff[i] = INT_MIN;
3417     else
3418       best_pred_diff[i] = best_rd - best_pred_rd[i];
3419   }
3420
3421   if (!x->skip) {
3422     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
3423       if (best_filter_rd[i] == INT64_MAX)
3424         best_filter_diff[i] = 0;
3425       else
3426         best_filter_diff[i] = best_rd - best_filter_rd[i];
3427     }
3428     if (cm->interp_filter == SWITCHABLE)
3429       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
3430     for (i = 0; i < TX_MODES; i++) {
3431       if (best_tx_rd[i] == INT64_MAX)
3432         best_tx_diff[i] = 0;
3433       else
3434         best_tx_diff[i] = best_rd - best_tx_rd[i];
3435     }
3436   } else {
3437     vp9_zero(best_filter_diff);
3438     vp9_zero(best_tx_diff);
3439   }
3440
3441   // TODO(yunqingwang): Moving this line in front of the above best_filter_diff
3442   // updating code causes PSNR loss. Need to figure out the confliction.
3443   x->skip |= best_mode_skippable;
3444
3445   if (!x->skip && !x->select_tx_size) {
3446     int has_high_freq_coeff = 0;
3447     int plane;
3448     int max_plane = is_inter_block(&xd->mi[0].src_mi->mbmi)
3449                         ? MAX_MB_PLANE : 1;
3450     for (plane = 0; plane < max_plane; ++plane) {
3451       x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
3452       has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
3453     }
3454
3455     for (plane = max_plane; plane < MAX_MB_PLANE; ++plane) {
3456       x->plane[plane].eobs = ctx->eobs_pbuf[plane][2];
3457       has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
3458     }
3459
3460     best_mode_skippable |= !has_high_freq_coeff;
3461   }
3462
3463   store_coding_context(x, ctx, best_mode_index, best_pred_diff,
3464                        best_tx_diff, best_filter_diff, best_mode_skippable);
3465 }
3466
3467 void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
3468                                         TileDataEnc *tile_data,
3469                                         MACROBLOCK *x,
3470                                         RD_COST *rd_cost,
3471                                         BLOCK_SIZE bsize,
3472                                         PICK_MODE_CONTEXT *ctx,
3473                                         int64_t best_rd_so_far) {
3474   VP9_COMMON *const cm = &cpi->common;
3475   MACROBLOCKD *const xd = &x->e_mbd;
3476   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
3477   unsigned char segment_id = mbmi->segment_id;
3478   const int comp_pred = 0;
3479   int i;
3480   int64_t best_tx_diff[TX_MODES];
3481   int64_t best_pred_diff[REFERENCE_MODES];
3482   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3483   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3484   vp9_prob comp_mode_p;
3485   INTERP_FILTER best_filter = SWITCHABLE;
3486   int64_t this_rd = INT64_MAX;
3487   int rate2 = 0;
3488   const int64_t distortion2 = 0;
3489
3490   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
3491
3492   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
3493                            &comp_mode_p);
3494
3495   for (i = 0; i < MAX_REF_FRAMES; ++i)
3496     x->pred_sse[i] = INT_MAX;
3497   for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
3498     x->pred_mv_sad[i] = INT_MAX;
3499
3500   rd_cost->rate = INT_MAX;
3501
3502   assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
3503
3504   mbmi->mode = ZEROMV;
3505   mbmi->uv_mode = DC_PRED;
3506   mbmi->ref_frame[0] = LAST_FRAME;
3507   mbmi->ref_frame[1] = NONE;
3508   mbmi->mv[0].as_int = 0;
3509   x->skip = 1;
3510
3511   if (cm->interp_filter != BILINEAR) {
3512     best_filter = EIGHTTAP;
3513     if (cm->interp_filter == SWITCHABLE &&
3514         x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
3515       int rs;
3516       int best_rs = INT_MAX;
3517       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
3518         mbmi->interp_filter = i;
3519         rs = vp9_get_switchable_rate(cpi, xd);
3520         if (rs < best_rs) {
3521           best_rs = rs;
3522           best_filter = mbmi->interp_filter;
3523         }
3524       }
3525     }
3526   }
3527   // Set the appropriate filter
3528   if (cm->interp_filter == SWITCHABLE) {
3529     mbmi->interp_filter = best_filter;
3530     rate2 += vp9_get_switchable_rate(cpi, xd);
3531   } else {
3532     mbmi->interp_filter = cm->interp_filter;
3533   }
3534
3535   if (cm->reference_mode == REFERENCE_MODE_SELECT)
3536     rate2 += vp9_cost_bit(comp_mode_p, comp_pred);
3537
3538   // Estimate the reference frame signaling cost and add it
3539   // to the rolling cost variable.
3540   rate2 += ref_costs_single[LAST_FRAME];
3541   this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3542
3543   rd_cost->rate = rate2;
3544   rd_cost->dist = distortion2;
3545   rd_cost->rdcost = this_rd;
3546
3547   if (this_rd >= best_rd_so_far) {
3548     rd_cost->rate = INT_MAX;
3549     rd_cost->rdcost = INT64_MAX;
3550     return;
3551   }
3552
3553   assert((cm->interp_filter == SWITCHABLE) ||
3554          (cm->interp_filter == mbmi->interp_filter));
3555
3556   vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
3557                             cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
3558
3559   vp9_zero(best_pred_diff);
3560   vp9_zero(best_filter_diff);
3561   vp9_zero(best_tx_diff);
3562
3563   if (!x->select_tx_size)
3564     swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
3565   store_coding_context(x, ctx, THR_ZEROMV,
3566                        best_pred_diff, best_tx_diff, best_filter_diff, 0);
3567 }
3568
3569 void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
3570                                    TileDataEnc *tile_data,
3571                                    MACROBLOCK *x,
3572                                    int mi_row, int mi_col,
3573                                    RD_COST *rd_cost,
3574                                    BLOCK_SIZE bsize,
3575                                    PICK_MODE_CONTEXT *ctx,
3576                                    int64_t best_rd_so_far) {
3577   VP9_COMMON *const cm = &cpi->common;
3578   TileInfo *const tile_info = &tile_data->tile_info;
3579   RD_OPT *const rd_opt = &cpi->rd;
3580   SPEED_FEATURES *const sf = &cpi->sf;
3581   MACROBLOCKD *const xd = &x->e_mbd;
3582   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
3583   const struct segmentation *const seg = &cm->seg;
3584   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
3585   unsigned char segment_id = mbmi->segment_id;
3586   int comp_pred, i;
3587   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
3588   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
3589   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
3590                                     VP9_ALT_FLAG };
3591   int64_t best_rd = best_rd_so_far;
3592   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
3593   static const int64_t best_tx_diff[TX_MODES] = { 0 };
3594   int64_t best_pred_diff[REFERENCE_MODES];
3595   int64_t best_pred_rd[REFERENCE_MODES];
3596   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
3597   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3598   MB_MODE_INFO best_mbmode;
3599   int ref_index, best_ref_index = 0;
3600   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3601   vp9_prob comp_mode_p;
3602   INTERP_FILTER tmp_best_filter = SWITCHABLE;
3603   int rate_uv_intra, rate_uv_tokenonly;
3604   int64_t dist_uv;
3605   int skip_uv;
3606   PREDICTION_MODE mode_uv = DC_PRED;
3607   const int intra_cost_penalty = vp9_get_intra_cost_penalty(
3608       cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
3609   int_mv seg_mvs[4][MAX_REF_FRAMES];
3610   b_mode_info best_bmodes[4];
3611   int best_skip2 = 0;
3612   int ref_frame_skip_mask[2] = { 0 };
3613   int64_t mask_filter = 0;
3614   int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
3615
3616   x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
3617   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
3618   vp9_zero(best_mbmode);
3619
3620   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
3621     filter_cache[i] = INT64_MAX;
3622
3623   for (i = 0; i < 4; i++) {
3624     int j;
3625     for (j = 0; j < MAX_REF_FRAMES; j++)
3626       seg_mvs[i][j].as_int = INVALID_MV;
3627   }
3628
3629   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
3630                            &comp_mode_p);
3631
3632   for (i = 0; i < REFERENCE_MODES; ++i)
3633     best_pred_rd[i] = INT64_MAX;
3634   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3635     best_filter_rd[i] = INT64_MAX;
3636   rate_uv_intra = INT_MAX;
3637
3638   rd_cost->rate = INT_MAX;
3639
3640   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
3641     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
3642       setup_buffer_inter(cpi, x, tile_info,
3643                          ref_frame, bsize, mi_row, mi_col,
3644                          frame_mv[NEARESTMV], frame_mv[NEARMV],
3645                          yv12_mb);
3646     } else {
3647       ref_frame_skip_mask[0] |= (1 << ref_frame);
3648       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
3649     }
3650     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
3651     frame_mv[ZEROMV][ref_frame].as_int = 0;
3652   }
3653
3654   for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
3655     int mode_excluded = 0;
3656     int64_t this_rd = INT64_MAX;
3657     int disable_skip = 0;
3658     int compmode_cost = 0;
3659     int rate2 = 0, rate_y = 0, rate_uv = 0;
3660     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
3661     int skippable = 0;
3662     int i;
3663     int this_skip2 = 0;
3664     int64_t total_sse = INT_MAX;
3665     int early_term = 0;
3666
3667     ref_frame = vp9_ref_order[ref_index].ref_frame[0];
3668     second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
3669
3670     // Look at the reference frame of the best mode so far and set the
3671     // skip mask to look at a subset of the remaining modes.
3672     if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
3673       if (ref_index == 3) {
3674         switch (best_mbmode.ref_frame[0]) {
3675           case INTRA_FRAME:
3676             break;
3677           case LAST_FRAME:
3678             ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME);
3679             ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
3680             break;
3681           case GOLDEN_FRAME:
3682             ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << ALTREF_FRAME);
3683             ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
3684             break;
3685           case ALTREF_FRAME:
3686             ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
3687             break;
3688           case NONE:
3689           case MAX_REF_FRAMES:
3690             assert(0 && "Invalid Reference frame");
3691             break;
3692         }
3693       }
3694     }
3695
3696     if (ref_frame_skip_mask[0] & (1 << ref_frame) &&
3697         ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame)))
3698       continue;
3699
3700     // Test best rd so far against threshold for trying this mode.
3701     if (rd_less_than_thresh(best_rd,
3702                             rd_opt->threshes[segment_id][bsize][ref_index],
3703                             tile_data->thresh_freq_fact[bsize][ref_index]))
3704       continue;
3705
3706     comp_pred = second_ref_frame > INTRA_FRAME;
3707     if (comp_pred) {
3708       if (!cm->allow_comp_inter_inter)
3709         continue;
3710       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
3711         continue;
3712       // Do not allow compound prediction if the segment level reference frame
3713       // feature is in use as in this case there can only be one reference.
3714       if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
3715         continue;
3716
3717       if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
3718           best_mbmode.ref_frame[0] == INTRA_FRAME)
3719         continue;
3720     }
3721
3722     // TODO(jingning, jkoleszar): scaling reference frame not supported for
3723     // sub8x8 blocks.
3724     if (ref_frame > INTRA_FRAME &&
3725         vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
3726       continue;
3727
3728     if (second_ref_frame > INTRA_FRAME &&
3729         vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
3730       continue;
3731
3732     if (comp_pred)
3733       mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
3734     else if (ref_frame != INTRA_FRAME)
3735       mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
3736
3737     // If the segment reference frame feature is enabled....
3738     // then do nothing if the current ref frame is not allowed..
3739     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
3740         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
3741       continue;
3742     // Disable this drop out case if the ref frame
3743     // segment level feature is enabled for this segment. This is to
3744     // prevent the possibility that we end up unable to pick any mode.
3745     } else if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
3746       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
3747       // unless ARNR filtering is enabled in which case we want
3748       // an unfiltered alternative. We allow near/nearest as well
3749       // because they may result in zero-zero MVs but be cheaper.
3750       if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
3751         continue;
3752     }
3753
3754     mbmi->tx_size = TX_4X4;
3755     mbmi->uv_mode = DC_PRED;
3756     mbmi->ref_frame[0] = ref_frame;
3757     mbmi->ref_frame[1] = second_ref_frame;
3758     // Evaluate all sub-pel filters irrespective of whether we can use
3759     // them for this frame.
3760     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
3761                                                           : cm->interp_filter;
3762     x->skip = 0;
3763     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
3764
3765     // Select prediction reference frames.
3766     for (i = 0; i < MAX_MB_PLANE; i++) {
3767       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
3768       if (comp_pred)
3769         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
3770     }
3771
3772     if (ref_frame == INTRA_FRAME) {
3773       int rate;
3774       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
3775                                        &distortion_y, best_rd) >= best_rd)
3776         continue;
3777       rate2 += rate;
3778       rate2 += intra_cost_penalty;
3779       distortion2 += distortion_y;
3780
3781       if (rate_uv_intra == INT_MAX) {
3782         choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4,
3783                              &rate_uv_intra,
3784                              &rate_uv_tokenonly,
3785                              &dist_uv, &skip_uv,
3786                              &mode_uv);
3787       }
3788       rate2 += rate_uv_intra;
3789       rate_uv = rate_uv_tokenonly;
3790       distortion2 += dist_uv;
3791       distortion_uv = dist_uv;
3792       mbmi->uv_mode = mode_uv;
3793     } else {
3794       int rate;
3795       int64_t distortion;
3796       int64_t this_rd_thresh;
3797       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
3798       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
3799       int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
3800       int tmp_best_skippable = 0;
3801       int switchable_filter_index;
3802       int_mv *second_ref = comp_pred ?
3803                              &mbmi->ref_mvs[second_ref_frame][0] : NULL;
3804       b_mode_info tmp_best_bmodes[16];
3805       MB_MODE_INFO tmp_best_mbmode;
3806       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
3807       int pred_exists = 0;
3808       int uv_skippable;
3809
3810       this_rd_thresh = (ref_frame == LAST_FRAME) ?
3811           rd_opt->threshes[segment_id][bsize][THR_LAST] :
3812           rd_opt->threshes[segment_id][bsize][THR_ALTR];
3813       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
3814       rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
3815       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
3816         filter_cache[i] = INT64_MAX;
3817
3818       if (cm->interp_filter != BILINEAR) {
3819         tmp_best_filter = EIGHTTAP;
3820         if (x->source_variance < sf->disable_filter_search_var_thresh) {
3821           tmp_best_filter = EIGHTTAP;
3822         } else if (sf->adaptive_pred_interp_filter == 1 &&
3823                    ctx->pred_interp_filter < SWITCHABLE) {
3824           tmp_best_filter = ctx->pred_interp_filter;
3825         } else if (sf->adaptive_pred_interp_filter == 2) {
3826           tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
3827                               ctx->pred_interp_filter : 0;
3828         } else {
3829           for (switchable_filter_index = 0;
3830                switchable_filter_index < SWITCHABLE_FILTERS;
3831                ++switchable_filter_index) {
3832             int newbest, rs;
3833             int64_t rs_rd;
3834             mbmi->interp_filter = switchable_filter_index;
3835             tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
3836                                               &mbmi->ref_mvs[ref_frame][0],
3837                                               second_ref, best_yrd, &rate,
3838                                               &rate_y, &distortion,
3839                                               &skippable, &total_sse,
3840                                               (int) this_rd_thresh, seg_mvs,
3841                                               bsi, switchable_filter_index,
3842                                               mi_row, mi_col);
3843
3844             if (tmp_rd == INT64_MAX)
3845               continue;
3846             rs = vp9_get_switchable_rate(cpi, xd);
3847             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
3848             filter_cache[switchable_filter_index] = tmp_rd;
3849             filter_cache[SWITCHABLE_FILTERS] =
3850                 MIN(filter_cache[SWITCHABLE_FILTERS],
3851                     tmp_rd + rs_rd);
3852             if (cm->interp_filter == SWITCHABLE)
3853               tmp_rd += rs_rd;
3854
3855             mask_filter = MAX(mask_filter, tmp_rd);
3856
3857             newbest = (tmp_rd < tmp_best_rd);
3858             if (newbest) {
3859               tmp_best_filter = mbmi->interp_filter;
3860               tmp_best_rd = tmp_rd;
3861             }
3862             if ((newbest && cm->interp_filter == SWITCHABLE) ||
3863                 (mbmi->interp_filter == cm->interp_filter &&
3864                  cm->interp_filter != SWITCHABLE)) {
3865               tmp_best_rdu = tmp_rd;
3866               tmp_best_rate = rate;
3867               tmp_best_ratey = rate_y;
3868               tmp_best_distortion = distortion;
3869               tmp_best_sse = total_sse;
3870               tmp_best_skippable = skippable;
3871               tmp_best_mbmode = *mbmi;
3872               for (i = 0; i < 4; i++) {
3873                 tmp_best_bmodes[i] = xd->mi[0].src_mi->bmi[i];
3874                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
3875               }
3876               pred_exists = 1;
3877               if (switchable_filter_index == 0 &&
3878                   sf->use_rd_breakout &&
3879                   best_rd < INT64_MAX) {
3880                 if (tmp_best_rdu / 2 > best_rd) {
3881                   // skip searching the other filters if the first is
3882                   // already substantially larger than the best so far
3883                   tmp_best_filter = mbmi->interp_filter;
3884                   tmp_best_rdu = INT64_MAX;
3885                   break;
3886                 }
3887               }
3888             }
3889           }  // switchable_filter_index loop
3890         }
3891       }
3892
3893       if (tmp_best_rdu == INT64_MAX && pred_exists)
3894         continue;
3895
3896       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
3897                              tmp_best_filter : cm->interp_filter);
3898       if (!pred_exists) {
3899         // Handles the special case when a filter that is not in the
3900         // switchable list (bilinear, 6-tap) is indicated at the frame level
3901         tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
3902                                           &mbmi->ref_mvs[ref_frame][0],
3903                                           second_ref, best_yrd, &rate, &rate_y,
3904                                           &distortion, &skippable, &total_sse,
3905                                           (int) this_rd_thresh, seg_mvs, bsi, 0,
3906                                           mi_row, mi_col);
3907         if (tmp_rd == INT64_MAX)
3908           continue;
3909       } else {
3910         total_sse = tmp_best_sse;
3911         rate = tmp_best_rate;
3912         rate_y = tmp_best_ratey;
3913         distortion = tmp_best_distortion;
3914         skippable = tmp_best_skippable;
3915         *mbmi = tmp_best_mbmode;
3916         for (i = 0; i < 4; i++)
3917           xd->mi[0].src_mi->bmi[i] = tmp_best_bmodes[i];
3918       }
3919
3920       rate2 += rate;
3921       distortion2 += distortion;
3922
3923       if (cm->interp_filter == SWITCHABLE)
3924         rate2 += vp9_get_switchable_rate(cpi, xd);
3925
3926       if (!mode_excluded)
3927         mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
3928                                   : cm->reference_mode == COMPOUND_REFERENCE;
3929
3930       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
3931
3932       tmp_best_rdu = best_rd -
3933           MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
3934               RDCOST(x->rdmult, x->rddiv, 0, total_sse));
3935
3936       if (tmp_best_rdu > 0) {
3937         // If even the 'Y' rd value of split is higher than best so far
3938         // then don't bother looking at UV
3939         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
3940                                         BLOCK_8X8);
3941         vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
3942         if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
3943                               &uv_sse, BLOCK_8X8, tmp_best_rdu))
3944           continue;
3945
3946         rate2 += rate_uv;
3947         distortion2 += distortion_uv;
3948         skippable = skippable && uv_skippable;
3949         total_sse += uv_sse;
3950       }
3951     }
3952
3953     if (cm->reference_mode == REFERENCE_MODE_SELECT)
3954       rate2 += compmode_cost;
3955
3956     // Estimate the reference frame signaling cost and add it
3957     // to the rolling cost variable.
3958     if (second_ref_frame > INTRA_FRAME) {
3959       rate2 += ref_costs_comp[ref_frame];
3960     } else {
3961       rate2 += ref_costs_single[ref_frame];
3962     }
3963
3964     if (!disable_skip) {
3965       // Skip is never coded at the segment level for sub8x8 blocks and instead
3966       // always coded in the bitstream at the mode info level.
3967
3968       if (ref_frame != INTRA_FRAME && !xd->lossless) {
3969         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
3970             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
3971           // Add in the cost of the no skip flag.
3972           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3973         } else {
3974           // FIXME(rbultje) make this work for splitmv also
3975           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
3976           distortion2 = total_sse;
3977           assert(total_sse >= 0);
3978           rate2 -= (rate_y + rate_uv);
3979           rate_y = 0;
3980           rate_uv = 0;
3981           this_skip2 = 1;
3982         }
3983       } else {
3984         // Add in the cost of the no skip flag.
3985         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3986       }
3987
3988       // Calculate the final RD estimate for this mode.
3989       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3990     }
3991
3992     if (!disable_skip && ref_frame == INTRA_FRAME) {
3993       for (i = 0; i < REFERENCE_MODES; ++i)
3994         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
3995       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3996         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
3997     }
3998
3999     // Did this mode help, i.e. is it the new best mode?
4000     if (this_rd < best_rd || x->skip) {
4001       if (!mode_excluded) {
4002         int max_plane = MAX_MB_PLANE;
4003         // Note index of best mode so far
4004         best_ref_index = ref_index;
4005
4006         if (ref_frame == INTRA_FRAME) {
4007           /* required for left and above block mv */
4008           mbmi->mv[0].as_int = 0;
4009           max_plane = 1;
4010         }
4011
4012         rd_cost->rate = rate2;
4013         rd_cost->dist = distortion2;
4014         rd_cost->rdcost = this_rd;
4015         best_rd = this_rd;
4016         best_yrd = best_rd -
4017                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
4018         best_mbmode = *mbmi;
4019         best_skip2 = this_skip2;
4020         if (!x->select_tx_size)
4021           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
4022         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
4023                    sizeof(uint8_t) * ctx->num_4x4_blk);
4024
4025         for (i = 0; i < 4; i++)
4026           best_bmodes[i] = xd->mi[0].src_mi->bmi[i];
4027
4028         // TODO(debargha): enhance this test with a better distortion prediction
4029         // based on qp, activity mask and history
4030         if ((sf->mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
4031             (ref_index > MIN_EARLY_TERM_INDEX)) {
4032           int qstep = xd->plane[0].dequant[1];
4033           // TODO(debargha): Enhance this by specializing for each mode_index
4034           int scale = 4;
4035 #if CONFIG_VP9_HIGHBITDEPTH
4036           if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
4037             qstep >>= (xd->bd - 8);
4038           }
4039 #endif  // CONFIG_VP9_HIGHBITDEPTH
4040           if (x->source_variance < UINT_MAX) {
4041             const int var_adjust = (x->source_variance < 16);
4042             scale -= var_adjust;
4043           }
4044           if (ref_frame > INTRA_FRAME &&
4045               distortion2 * scale < qstep * qstep) {
4046             early_term = 1;
4047           }
4048         }
4049       }
4050     }
4051
4052     /* keep record of best compound/single-only prediction */
4053     if (!disable_skip && ref_frame != INTRA_FRAME) {
4054       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
4055
4056       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
4057         single_rate = rate2 - compmode_cost;
4058         hybrid_rate = rate2;
4059       } else {
4060         single_rate = rate2;
4061         hybrid_rate = rate2 + compmode_cost;
4062       }
4063
4064       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
4065       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
4066
4067       if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE])
4068         best_pred_rd[SINGLE_REFERENCE] = single_rd;
4069       else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE])
4070         best_pred_rd[COMPOUND_REFERENCE] = single_rd;
4071
4072       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
4073         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
4074     }
4075
4076     /* keep record of best filter type */
4077     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
4078         cm->interp_filter != BILINEAR) {
4079       int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
4080                               SWITCHABLE_FILTERS : cm->interp_filter];
4081       int64_t adj_rd;
4082       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
4083         if (ref == INT64_MAX)
4084           adj_rd = 0;
4085         else if (filter_cache[i] == INT64_MAX)
4086           // When early termination is triggered, the encoder does not have
4087           // access to the rate-distortion cost. It only knows that the cost
4088           // should be above the maximum valid value. Hence it takes the known
4089           // maximum plus an arbitrary constant as the rate-distortion cost.
4090           adj_rd = mask_filter - ref + 10;
4091         else
4092           adj_rd = filter_cache[i] - ref;
4093
4094         adj_rd += this_rd;
4095         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
4096       }
4097     }
4098
4099     if (early_term)
4100       break;
4101
4102     if (x->skip && !comp_pred)
4103       break;
4104   }
4105
4106   if (best_rd >= best_rd_so_far) {
4107     rd_cost->rate = INT_MAX;
4108     rd_cost->rdcost = INT64_MAX;
4109     return;
4110   }
4111
4112   // If we used an estimate for the uv intra rd in the loop above...
4113   if (sf->use_uv_intra_rd_estimate) {
4114     // Do Intra UV best rd mode selection if best mode choice above was intra.
4115     if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
4116       *mbmi = best_mbmode;
4117       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra,
4118                               &rate_uv_tokenonly,
4119                               &dist_uv,
4120                               &skip_uv,
4121                               BLOCK_8X8, TX_4X4);
4122     }
4123   }
4124
4125   if (best_rd == INT64_MAX) {
4126     rd_cost->rate = INT_MAX;
4127     rd_cost->dist = INT64_MAX;
4128     rd_cost->rdcost = INT64_MAX;
4129     return;
4130   }
4131
4132   assert((cm->interp_filter == SWITCHABLE) ||
4133          (cm->interp_filter == best_mbmode.interp_filter) ||
4134          !is_inter_block(&best_mbmode));
4135
4136   vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
4137                             sf->adaptive_rd_thresh, bsize, best_ref_index);
4138
4139   // macroblock modes
4140   *mbmi = best_mbmode;
4141   x->skip |= best_skip2;
4142   if (!is_inter_block(&best_mbmode)) {
4143     for (i = 0; i < 4; i++)
4144       xd->mi[0].src_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
4145   } else {
4146     for (i = 0; i < 4; ++i)
4147       vpx_memcpy(&xd->mi[0].src_mi->bmi[i], &best_bmodes[i],
4148                  sizeof(b_mode_info));
4149
4150     mbmi->mv[0].as_int = xd->mi[0].src_mi->bmi[3].as_mv[0].as_int;
4151     mbmi->mv[1].as_int = xd->mi[0].src_mi->bmi[3].as_mv[1].as_int;
4152   }
4153
4154   for (i = 0; i < REFERENCE_MODES; ++i) {
4155     if (best_pred_rd[i] == INT64_MAX)
4156       best_pred_diff[i] = INT_MIN;
4157     else
4158       best_pred_diff[i] = best_rd - best_pred_rd[i];
4159   }
4160
4161   if (!x->skip) {
4162     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
4163       if (best_filter_rd[i] == INT64_MAX)
4164         best_filter_diff[i] = 0;
4165       else
4166         best_filter_diff[i] = best_rd - best_filter_rd[i];
4167     }
4168     if (cm->interp_filter == SWITCHABLE)
4169       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
4170   } else {
4171     vp9_zero(best_filter_diff);
4172   }
4173
4174   store_coding_context(x, ctx, best_ref_index,
4175                        best_pred_diff, best_tx_diff, best_filter_diff, 0);
4176 }
4177