granicus.if.org Git - libvpx/blob - vp9/encoder/vp9_rdopt.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11
  12 #include <stdio.h>
  13 #include <math.h>
  14 #include <limits.h>
  15 #include <assert.h>
  16
  17 #include "vp9/common/vp9_pragmas.h"
  18 #include "vp9/encoder/vp9_tokenize.h"
  19 #include "vp9/encoder/vp9_treewriter.h"
  20 #include "vp9/encoder/vp9_onyx_int.h"
  21 #include "vp9/encoder/vp9_modecosts.h"
  22 #include "vp9/encoder/vp9_encodeintra.h"
  23 #include "vp9/common/vp9_entropymode.h"
  24 #include "vp9/common/vp9_reconinter.h"
  25 #include "vp9/common/vp9_reconintra.h"
  26 #include "vp9/common/vp9_findnearmv.h"
  27 #include "vp9/common/vp9_quant_common.h"
  28 #include "vp9/encoder/vp9_encodemb.h"
  29 #include "vp9/encoder/vp9_quantize.h"
  30 #include "vp9/encoder/vp9_variance.h"
  31 #include "vp9/encoder/vp9_mcomp.h"
  32 #include "vp9/encoder/vp9_rdopt.h"
  33 #include "vp9/encoder/vp9_ratectrl.h"
  34 #include "vpx_mem/vpx_mem.h"
  35 #include "vp9/common/vp9_systemdependent.h"
  36 #include "vp9/encoder/vp9_encodemv.h"
  37 #include "vp9/common/vp9_seg_common.h"
  38 #include "vp9/common/vp9_pred_common.h"
  39 #include "vp9/common/vp9_entropy.h"
  40 #include "vp9_rtcd.h"
  41 #include "vp9/common/vp9_mvref_common.h"
  42 #include "vp9/common/vp9_common.h"
  43
  44 #define INVALID_MV 0x80008000
  45
  46 /* Factor to weigh the rate for switchable interp filters */
  47 #define SWITCHABLE_INTERP_RATE_FACTOR 1
  48
  49 DECLARE_ALIGNED(16, extern const uint8_t,
  50                 vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
  51
  52 #define I4X4_PRED 0x8000
  53 #define SPLITMV 0x10000
  54
  55 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
  56   {NEARESTMV, LAST_FRAME,   NONE},
  57   {NEARESTMV, ALTREF_FRAME, NONE},
  58   {NEARESTMV, GOLDEN_FRAME, NONE},
  59   {NEWMV,     LAST_FRAME,   NONE},
  60   {NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
  61   {NEARMV,    LAST_FRAME,   NONE},
  62   {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
  63
  64   {DC_PRED,   INTRA_FRAME,  NONE},
  65
  66   {NEWMV,     GOLDEN_FRAME, NONE},
  67   {NEWMV,     ALTREF_FRAME, NONE},
  68   {NEARMV,    ALTREF_FRAME, NONE},
  69
  70   {TM_PRED,   INTRA_FRAME,  NONE},
  71
  72   {NEARMV,    LAST_FRAME,   ALTREF_FRAME},
  73   {NEWMV,     LAST_FRAME,   ALTREF_FRAME},
  74   {NEARMV,    GOLDEN_FRAME, NONE},
  75   {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
  76   {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
  77
  78   {SPLITMV,   LAST_FRAME,   NONE},
  79   {SPLITMV,   GOLDEN_FRAME, NONE},
  80   {SPLITMV,   ALTREF_FRAME, NONE},
  81   {SPLITMV,   LAST_FRAME,   ALTREF_FRAME},
  82   {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME},
  83
  84   {ZEROMV,    LAST_FRAME,   NONE},
  85   {ZEROMV,    GOLDEN_FRAME, NONE},
  86   {ZEROMV,    ALTREF_FRAME, NONE},
  87   {ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
  88   {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
  89
  90   {I4X4_PRED, INTRA_FRAME,  NONE},
  91   {H_PRED,    INTRA_FRAME,  NONE},
  92   {V_PRED,    INTRA_FRAME,  NONE},
  93   {D135_PRED, INTRA_FRAME,  NONE},
  94   {D27_PRED,  INTRA_FRAME,  NONE},
  95   {D153_PRED, INTRA_FRAME,  NONE},
  96   {D63_PRED,  INTRA_FRAME,  NONE},
  97   {D117_PRED, INTRA_FRAME,  NONE},
  98   {D45_PRED,  INTRA_FRAME,  NONE},
  99 };
 100
 101 // The baseline rd thresholds for breaking out of the rd loop for
 102 // certain modes are assumed to be based on 8x8 blocks.
 103 // This table is used to correct for blocks size.
 104 // The factors here are << 2 (2 = x0.5, 32 = x8 etc).
 105 static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] =
 106   {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
 107
 108 #define BASE_RD_THRESH_FREQ_FACT 16
 109 #define MAX_RD_THRESH_FREQ_FACT 32
 110 #define MAX_RD_THRESH_FREQ_INC 1
 111
 112 static void fill_token_costs(vp9_coeff_cost *c,
 113                              vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
 114   int i, j, k, l;
 115   TX_SIZE t;
 116   for (t = TX_4X4; t <= TX_32X32; t++)
 117     for (i = 0; i < BLOCK_TYPES; i++)
 118       for (j = 0; j < REF_TYPES; j++)
 119         for (k = 0; k < COEF_BANDS; k++)
 120           for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
 121             vp9_prob probs[ENTROPY_NODES];
 122             vp9_model_to_full_probs(p[t][i][j][k][l], probs);
 123             vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
 124                             vp9_coef_tree);
 125             vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
 126                                  vp9_coef_tree);
 127             assert(c[t][i][j][k][0][l][DCT_EOB_TOKEN] ==
 128                    c[t][i][j][k][1][l][DCT_EOB_TOKEN]);
 129           }
 130 }
 131
 132 static const int rd_iifactor[32] = {
 133   4, 4, 3, 2, 1, 0, 0, 0,
 134   0, 0, 0, 0, 0, 0, 0, 0,
 135   0, 0, 0, 0, 0, 0, 0, 0,
 136   0, 0, 0, 0, 0, 0, 0, 0,
 137 };
 138
 139 // 3* dc_qlookup[Q]*dc_qlookup[Q];
 140
 141 /* values are now correlated to quantizer */
 142 static int sad_per_bit16lut[QINDEX_RANGE];
 143 static int sad_per_bit4lut[QINDEX_RANGE];
 144
 145 void vp9_init_me_luts() {
 146   int i;
 147
 148   // Initialize the sad lut tables using a formulaic calculation for now
 149   // This is to make it easier to resolve the impact of experimental changes
 150   // to the quantizer tables.
 151   for (i = 0; i < QINDEX_RANGE; i++) {
 152     sad_per_bit16lut[i] =
 153       (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
 154     sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
 155   }
 156 }
 157
 158 static int compute_rd_mult(int qindex) {
 159   const int q = vp9_dc_quant(qindex, 0);
 160   return (11 * q * q) >> 2;
 161 }
 162
 163 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
 164   cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
 165   cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
 166 }
 167
 168
 169 void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
 170   int q, i, bsize;
 171
 172   vp9_clear_system_state();  // __asm emms;
 173
 174   // Further tests required to see if optimum is different
 175   // for key frames, golden frames and arf frames.
 176   // if (cpi->common.refresh_golden_frame ||
 177   //     cpi->common.refresh_alt_ref_frame)
 178   qindex = clamp(qindex, 0, MAXQ);
 179
 180   cpi->RDMULT = compute_rd_mult(qindex);
 181   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
 182     if (cpi->twopass.next_iiratio > 31)
 183       cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
 184     else
 185       cpi->RDMULT +=
 186           (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
 187   }
 188   cpi->mb.errorperbit = cpi->RDMULT >> 6;
 189   cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
 190
 191   vp9_set_speed_features(cpi);
 192
 193   q = (int)pow(vp9_dc_quant(qindex, 0) >> 2, 1.25);
 194   q <<= 2;
 195   if (q < 8)
 196     q = 8;
 197
 198   if (cpi->RDMULT > 1000) {
 199     cpi->RDDIV = 1;
 200     cpi->RDMULT /= 100;
 201
 202     for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) {
 203       for (i = 0; i < MAX_MODES; ++i) {
 204         // Threshold here seem unecessarily harsh but fine given actual
 205         // range of values used for cpi->sf.thresh_mult[]
 206         int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
 207
 208         // *4 relates to the scaling of rd_thresh_block_size_factor[]
 209         if ((int64_t)cpi->sf.thresh_mult[i] < thresh_max) {
 210           cpi->rd_threshes[bsize][i] =
 211             cpi->sf.thresh_mult[i] * q *
 212             rd_thresh_block_size_factor[bsize] / (4 * 100);
 213         } else {
 214           cpi->rd_threshes[bsize][i] = INT_MAX;
 215         }
 216         cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
 217
 218         if (cpi->sf.adaptive_rd_thresh)
 219           cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
 220         else
 221           cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
 222       }
 223     }
 224   } else {
 225     cpi->RDDIV = 100;
 226
 227     for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) {
 228       for (i = 0; i < MAX_MODES; i++) {
 229         // Threshold here seem unecessarily harsh but fine given actual
 230         // range of values used for cpi->sf.thresh_mult[]
 231         int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
 232
 233         if (cpi->sf.thresh_mult[i] < thresh_max) {
 234           cpi->rd_threshes[bsize][i] =
 235             cpi->sf.thresh_mult[i] * q *
 236             rd_thresh_block_size_factor[bsize] / 4;
 237         } else {
 238           cpi->rd_threshes[bsize][i] = INT_MAX;
 239         }
 240         cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
 241
 242         if (cpi->sf.adaptive_rd_thresh)
 243           cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
 244         else
 245           cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
 246       }
 247     }
 248   }
 249
 250   fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs);
 251
 252   for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
 253     vp9_cost_tokens(cpi->mb.partition_cost[i],
 254                     cpi->common.fc.partition_prob[cpi->common.frame_type][i],
 255                     vp9_partition_tree);
 256
 257   /*rough estimate for costing*/
 258   vp9_init_mode_costs(cpi);
 259
 260   if (cpi->common.frame_type != KEY_FRAME) {
 261     vp9_build_nmv_cost_table(
 262         cpi->mb.nmvjointcost,
 263         cpi->mb.e_mbd.allow_high_precision_mv ?
 264         cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
 265         &cpi->common.fc.nmvc,
 266         cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
 267
 268     for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
 269       MB_PREDICTION_MODE m;
 270
 271       for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
 272         cpi->mb.inter_mode_cost[i][m - NEARESTMV] =
 273             cost_token(vp9_inter_mode_tree,
 274                        cpi->common.fc.inter_mode_probs[i],
 275                        vp9_inter_mode_encodings - NEARESTMV + m);
 276     }
 277   }
 278 }
 279
 280 static INLINE BLOCK_SIZE_TYPE get_block_size(int bwl, int bhl) {
 281   return bsize_from_dim_lookup[bwl][bhl];
 282 }
 283
 284 static BLOCK_SIZE_TYPE get_plane_block_size(BLOCK_SIZE_TYPE bsize,
 285                                             struct macroblockd_plane *pd) {
 286   return get_block_size(plane_block_width_log2by4(bsize, pd),
 287                         plane_block_height_log2by4(bsize, pd));
 288 }
 289
 290 static INLINE void linear_interpolate2(double x, int ntab, int inv_step,
 291                                        const double *tab1, const double *tab2,
 292                                        double *v1, double *v2) {
 293   double y = x * inv_step;
 294   int d = (int) y;
 295   if (d >= ntab - 1) {
 296     *v1 = tab1[ntab - 1];
 297     *v2 = tab2[ntab - 1];
 298   } else {
 299     double a = y - d;
 300     *v1 = tab1[d] * (1 - a) + tab1[d + 1] * a;
 301     *v2 = tab2[d] * (1 - a) + tab2[d + 1] * a;
 302   }
 303 }
 304
 305 static void model_rd_norm(double x, double *R, double *D) {
 306   static const int inv_tab_step = 8;
 307   static const int tab_size = 120;
 308   // NOTE: The tables below must be of the same size
 309   //
 310   // Normalized rate
 311   // This table models the rate for a Laplacian source
 312   // source with given variance when quantized with a uniform quantizer
 313   // with given stepsize. The closed form expression is:
 314   // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
 315   // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
 316   // and H(x) is the binary entropy function.
 317   static const double rate_tab[] = {
 318     64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194,
 319     2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206,
 320     1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708,
 321     0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412,
 322     0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236,
 323     0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132,
 324     0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073,
 325     0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040,
 326     0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022,
 327     0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012,
 328     0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006,
 329     0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003,
 330     0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002,
 331     0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
 332     0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000,
 333   };
 334   // Normalized distortion
 335   // This table models the normalized distortion for a Laplacian source
 336   // source with given variance when quantized with a uniform quantizer
 337   // with given stepsize. The closed form expression is:
 338   // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
 339   // where x = qpstep / sqrt(variance)
 340   // Note the actual distortion is Dn * variance.
 341   static const double dist_tab[] = {
 342     0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061,
 343     0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242,
 344     0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458,
 345     0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645,
 346     0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780,
 347     0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870,
 348     0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925,
 349     0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957,
 350     0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976,
 351     0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987,
 352     0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993,
 353     0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996,
 354     0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998,
 355     0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999,
 356     0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000,
 357   };
 358   /*
 359   assert(sizeof(rate_tab) == tab_size * sizeof(rate_tab[0]);
 360   assert(sizeof(dist_tab) == tab_size * sizeof(dist_tab[0]);
 361   assert(sizeof(rate_tab) == sizeof(dist_tab));
 362   */
 363   assert(x >= 0.0);
 364   linear_interpolate2(x, tab_size, inv_tab_step,
 365                       rate_tab, dist_tab, R, D);
 366 }
 367
 368 static void model_rd_from_var_lapndz(int var, int n, int qstep,
 369                                      int *rate, int64_t *dist) {
 370   // This function models the rate and distortion for a Laplacian
 371   // source with given variance when quantized with a uniform quantizer
 372   // with given stepsize. The closed form expressions are in:
 373   // Hang and Chen, "Source Model for transform video coder and its
 374   // application - Part I: Fundamental Theory", IEEE Trans. Circ.
 375   // Sys. for Video Tech., April 1997.
 376   vp9_clear_system_state();
 377   if (var == 0 || n == 0) {
 378     *rate = 0;
 379     *dist = 0;
 380   } else {
 381     double D, R;
 382     double s2 = (double) var / n;
 383     double x = qstep / sqrt(s2);
 384     model_rd_norm(x, &R, &D);
 385     *rate = ((n << 8) * R + 0.5);
 386     *dist = (var * D + 0.5);
 387   }
 388   vp9_clear_system_state();
 389 }
 390
 391 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
 392                             MACROBLOCK *x, MACROBLOCKD *xd,
 393                             int *out_rate_sum, int64_t *out_dist_sum) {
 394   // Note our transform coeffs are 8 times an orthogonal transform.
 395   // Hence quantizer step is also 8 times. To get effective quantizer
 396   // we need to divide by 8 before sending to modeling function.
 397   int i, rate_sum = 0, dist_sum = 0;
 398
 399   for (i = 0; i < MAX_MB_PLANE; ++i) {
 400     struct macroblock_plane *const p = &x->plane[i];
 401     struct macroblockd_plane *const pd = &xd->plane[i];
 402
 403     // TODO(dkovalev) the same code in get_plane_block_size
 404     const int bwl = plane_block_width_log2by4(bsize, pd);
 405     const int bhl = plane_block_height_log2by4(bsize, pd);
 406     const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl);
 407     unsigned int sse;
 408     int rate;
 409     int64_t dist;
 410     (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
 411                               pd->dst.buf, pd->dst.stride, &sse);
 412     // sse works better than var, since there is no dc prediction used
 413     model_rd_from_var_lapndz(sse, 16 << (bwl + bhl),
 414                              pd->dequant[1] >> 3, &rate, &dist);
 415
 416     rate_sum += rate;
 417     dist_sum += dist;
 418   }
 419
 420   *out_rate_sum = rate_sum;
 421   *out_dist_sum = dist_sum << 4;
 422 }
 423
 424 static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
 425                               MACROBLOCK *x, MACROBLOCKD *xd,
 426                               int *out_rate_sum, int64_t *out_dist_sum) {
 427   // Note our transform coeffs are 8 times an orthogonal transform.
 428   // Hence quantizer step is also 8 times. To get effective quantizer
 429   // we need to divide by 8 before sending to modeling function.
 430   struct macroblock_plane *const p = &x->plane[0];
 431   struct macroblockd_plane *const pd = &xd->plane[0];
 432
 433   // TODO(dkovalev) the same code in get_plane_block_size
 434   const int bwl = plane_block_width_log2by4(bsize, pd);
 435   const int bhl = plane_block_height_log2by4(bsize, pd);
 436   const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl);
 437   unsigned int sse;
 438   int rate;
 439   int64_t dist;
 440   (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
 441                             pd->dst.buf, pd->dst.stride, &sse);
 442   // sse works better than var, since there is no dc prediction used
 443   model_rd_from_var_lapndz(sse, 16 << (bwl + bhl),
 444                            pd->dequant[1] >> 3, &rate, &dist);
 445
 446   *out_rate_sum = rate;
 447   *out_dist_sum = dist << 4;
 448 }
 449
 450 static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
 451                                  TX_SIZE tx_size,
 452                                  MACROBLOCK *x, MACROBLOCKD *xd,
 453                                  int *out_rate_sum, int64_t *out_dist_sum,
 454                                  int *out_skip) {
 455   int t = 4, j, k;
 456   BLOCK_SIZE_TYPE bs = BLOCK_SIZE_AB4X4;
 457   struct macroblock_plane *const p = &x->plane[0];
 458   struct macroblockd_plane *const pd = &xd->plane[0];
 459   const int width = plane_block_width(bsize, pd);
 460   const int height = plane_block_height(bsize, pd);
 461   int rate_sum = 0;
 462   int64_t dist_sum = 0;
 463
 464   if (tx_size == TX_4X4) {
 465     bs = BLOCK_4X4;
 466     t = 4;
 467   } else if (tx_size == TX_8X8) {
 468     bs = BLOCK_8X8;
 469     t = 8;
 470   } else if (tx_size == TX_16X16) {
 471     bs = BLOCK_16X16;
 472     t = 16;
 473   } else if (tx_size == TX_32X32) {
 474     bs = BLOCK_32X32;
 475     t = 32;
 476   } else {
 477     assert(0);
 478   }
 479   *out_skip = 1;
 480   for (j = 0; j < height; j += t) {
 481     for (k = 0; k < width; k += t) {
 482       int rate;
 483       int64_t dist;
 484       unsigned int sse;
 485       (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k,
 486                                 p->src.stride,
 487                                 pd->dst.buf + j * pd->dst.stride + k,
 488                                 pd->dst.stride, &sse);
 489       // sse works better than var, since there is no dc prediction used
 490       model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
 491                                &rate, &dist);
 492       rate_sum += rate;
 493       dist_sum += dist;
 494       *out_skip &= (rate < 1024);
 495     }
 496   }
 497   *out_rate_sum = rate_sum;
 498   *out_dist_sum = (dist_sum << 4);
 499 }
 500
 501 int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
 502                           intptr_t block_size, int64_t *ssz) {
 503   int i;
 504   int64_t error = 0, sqcoeff = 0;
 505
 506   for (i = 0; i < block_size; i++) {
 507     int this_diff = coeff[i] - dqcoeff[i];
 508     error += (unsigned)this_diff * this_diff;
 509     sqcoeff += (unsigned) coeff[i] * coeff[i];
 510   }
 511
 512   *ssz = sqcoeff;
 513   return error;
 514 }
 515
 516 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
 517  * decide whether to include cost of a trailing EOB node or not (i.e. we
 518  * can skip this if the last coefficient in this transform block, e.g. the
 519  * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
 520  * were non-zero). */
 521 static const int16_t band_counts[TX_SIZES][8] = {
 522   { 1, 2, 3, 4,  3,   16 - 13, 0 },
 523   { 1, 2, 3, 4, 11,   64 - 21, 0 },
 524   { 1, 2, 3, 4, 11,  256 - 21, 0 },
 525   { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 526 };
 527
 528 static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
 529                               int plane, int block, PLANE_TYPE type,
 530                               ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
 531                               TX_SIZE tx_size,
 532                               const int16_t *scan, const int16_t *nb) {
 533   MACROBLOCKD *const xd = &mb->e_mbd;
 534   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
 535   int pt, c, cost;
 536   const int16_t *band_count = &band_counts[tx_size][1];
 537   const int eob = xd->plane[plane].eobs[block];
 538   const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
 539   const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
 540   unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS]
 541                     [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref];
 542   ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
 543   uint8_t token_cache[1024];
 544
 545   // Check for consistency of tx_size with mode info
 546   assert((!type && !plane) || (type && plane));
 547   if (type == PLANE_TYPE_Y_WITH_DC) {
 548     assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
 549   } else {
 550     assert(tx_size == get_uv_tx_size(mbmi));
 551   }
 552
 553   pt = combine_entropy_contexts(above_ec, left_ec);
 554
 555   if (eob == 0) {
 556     // single eob token
 557     cost = token_costs[0][0][pt][DCT_EOB_TOKEN];
 558     c = 0;
 559   } else {
 560     int v, prev_t, band_left = *band_count++;
 561
 562     // dc token
 563     v = qcoeff_ptr[0];
 564     prev_t = vp9_dct_value_tokens_ptr[v].token;
 565     cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
 566     token_cache[0] = vp9_pt_energy_class[prev_t];
 567     ++token_costs;
 568
 569     // ac tokens
 570     for (c = 1; c < eob; c++) {
 571       const int rc = scan[c];
 572       int t;
 573
 574       v = qcoeff_ptr[rc];
 575       t = vp9_dct_value_tokens_ptr[v].token;
 576       pt = get_coef_context(nb, token_cache, c);
 577       cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
 578       token_cache[rc] = vp9_pt_energy_class[t];
 579       prev_t = t;
 580       if (!--band_left) {
 581         band_left = *band_count++;
 582         ++token_costs;
 583       }
 584     }
 585
 586     // eob token
 587     if (band_left) {
 588       pt = get_coef_context(nb, token_cache, c);
 589       cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
 590     }
 591   }
 592
 593   // is eob first coefficient;
 594   *A = *L = c > 0;
 595
 596   return cost;
 597 }
 598
 599 struct rdcost_block_args {
 600   VP9_COMMON *cm;
 601   MACROBLOCK *x;
 602   ENTROPY_CONTEXT t_above[16];
 603   ENTROPY_CONTEXT t_left[16];
 604   TX_SIZE tx_size;
 605   int bw;
 606   int bh;
 607   int rate;
 608   int64_t dist;
 609   int64_t sse;
 610   int64_t best_rd;
 611   int skip;
 612   const int16_t *scan, *nb;
 613 };
 614
 615 static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
 616                        int ss_txfrm_size, void *arg) {
 617   struct rdcost_block_args* args = arg;
 618   MACROBLOCK* const x = args->x;
 619   MACROBLOCKD* const xd = &x->e_mbd;
 620   struct macroblock_plane *const p = &x->plane[0];
 621   struct macroblockd_plane *const pd = &xd->plane[0];
 622   int64_t this_sse;
 623   int shift = args->tx_size == TX_32X32 ? 0 : 2;
 624   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16);
 625   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
 626   args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
 627                                 &this_sse) >> shift;
 628   args->sse += this_sse >> shift;
 629
 630   if (x->skip_encode &&
 631       xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
 632     // TODO(jingning): tune the model to better capture the distortion.
 633     int64_t p = (pd->dequant[1] * pd->dequant[1] *
 634                     (1 << ss_txfrm_size)) >> shift;
 635     args->dist += p;
 636     args->sse  += p;
 637   }
 638 }
 639
 640 static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
 641                        int ss_txfrm_size, void *arg) {
 642   struct rdcost_block_args* args = arg;
 643   int x_idx, y_idx;
 644   MACROBLOCKD * const xd = &args->x->e_mbd;
 645
 646   txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx,
 647                            &y_idx);
 648
 649   args->rate += cost_coeffs(args->cm, args->x, plane, block,
 650                             xd->plane[plane].plane_type, args->t_above + x_idx,
 651                             args->t_left + y_idx, args->tx_size,
 652                             args->scan, args->nb);
 653 }
 654
 655 // FIXME(jingning): need to make the rd test of chroma components consistent
 656 // with that of luma component. this function should be deprecated afterwards.
 657 static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
 658                         BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
 659   MACROBLOCKD * const xd = &x->e_mbd;
 660   const int bwl = plane_block_width_log2by4(bsize, &xd->plane[plane]);
 661   const int bhl = plane_block_height_log2by4(bsize, &xd->plane[plane]);
 662   const int bw = 1 << bwl, bh = 1 << bhl;
 663   int i;
 664   struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
 665     0, 0, 0, INT64_MAX, 0 };
 666
 667   switch (tx_size) {
 668     case TX_4X4:
 669       vpx_memcpy(&args.t_above, xd->plane[plane].above_context,
 670                  sizeof(ENTROPY_CONTEXT) * bw);
 671       vpx_memcpy(&args.t_left, xd->plane[plane].left_context,
 672                  sizeof(ENTROPY_CONTEXT) * bh);
 673       args.scan = vp9_default_scan_4x4;
 674       args.nb = vp9_default_scan_4x4_neighbors;
 675       break;
 676     case TX_8X8:
 677       for (i = 0; i < bw; i += 2)
 678         args.t_above[i] = !!*(uint16_t *)&xd->plane[plane].above_context[i];
 679       for (i = 0; i < bh; i += 2)
 680         args.t_left[i] = !!*(uint16_t *)&xd->plane[plane].left_context[i];
 681       args.scan = vp9_default_scan_8x8;
 682       args.nb = vp9_default_scan_8x8_neighbors;
 683       break;
 684     case TX_16X16:
 685       for (i = 0; i < bw; i += 4)
 686         args.t_above[i] = !!*(uint32_t *)&xd->plane[plane].above_context[i];
 687       for (i = 0; i < bh; i += 4)
 688         args.t_left[i] = !!*(uint32_t *)&xd->plane[plane].left_context[i];
 689       args.scan = vp9_default_scan_16x16;
 690       args.nb = vp9_default_scan_16x16_neighbors;
 691       break;
 692     case TX_32X32:
 693       for (i = 0; i < bw; i += 8)
 694         args.t_above[i] = !!*(uint64_t *)&xd->plane[plane].above_context[i];
 695       for (i = 0; i < bh; i += 8)
 696         args.t_left[i] = !!*(uint64_t *)&xd->plane[plane].left_context[i];
 697       args.scan = vp9_default_scan_32x32;
 698       args.nb = vp9_default_scan_32x32_neighbors;
 699       break;
 700     default:
 701       assert(0);
 702   }
 703
 704   foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args);
 705   return args.rate;
 706 }
 707
 708 static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
 709                      BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
 710   int cost = 0, plane;
 711
 712   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
 713     cost += rdcost_plane(cm, x, plane, bsize, tx_size);
 714   }
 715   return cost;
 716 }
 717
 718 static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
 719                            int shift, int64_t *sse) {
 720   struct macroblockd_plane *p = &x->e_mbd.plane[0];
 721   const int bwl = plane_block_width_log2by4(bsize, p);
 722   const int bhl = plane_block_height_log2by4(bsize, p);
 723   int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
 724                               16 << (bwl + bhl), sse) >> shift;
 725   *sse >>= shift;
 726   return e;
 727 }
 728
 729 static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
 730                                 int shift, int64_t *sse) {
 731   int64_t sum = 0, this_sse;
 732   int plane;
 733
 734   *sse = 0;
 735   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
 736     struct macroblockd_plane *p = &x->e_mbd.plane[plane];
 737     const int bwl = plane_block_width_log2by4(bsize, p);
 738     const int bhl = plane_block_height_log2by4(bsize, p);
 739     sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
 740                            16 << (bwl + bhl), &this_sse);
 741     *sse += this_sse;
 742   }
 743   *sse >>= shift;
 744   return sum >> shift;
 745 }
 746
 747 static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize,
 748                            int ss_txfrm_size, void *arg) {
 749   struct rdcost_block_args *args = arg;
 750   MACROBLOCK *const x = args->x;
 751   MACROBLOCKD *const xd = &x->e_mbd;
 752   struct encode_b_args encode_args = {args->cm, x, NULL};
 753   int64_t rd1, rd2, rd;
 754
 755   if (args->skip)
 756     return;
 757   rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
 758   rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
 759   rd = MIN(rd1, rd2);
 760   if (rd > args->best_rd) {
 761     args->skip = 1;
 762     args->rate = INT_MAX;
 763     args->dist = INT64_MAX;
 764     args->sse  = INT64_MAX;
 765     return;
 766   }
 767
 768   if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
 769     encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args);
 770   else
 771     xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args);
 772
 773   dist_block(plane, block, bsize, ss_txfrm_size, args);
 774   rate_block(plane, block, bsize, ss_txfrm_size, args);
 775 }
 776
 777 static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
 778                                      int *rate, int64_t *distortion,
 779                                      int *skippable, int64_t *sse,
 780                                      int64_t ref_best_rd,
 781                                      BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
 782   MACROBLOCKD *const xd = &x->e_mbd;
 783   struct macroblockd_plane *const pd = &xd->plane[0];
 784   const int bwl = plane_block_width_log2by4(bsize, pd);
 785   const int bhl = plane_block_height_log2by4(bsize, pd);
 786   const int bw = 1 << bwl, bh = 1 << bhl;
 787   int i;
 788   struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
 789                                     0, 0, 0, ref_best_rd, 0 };
 790   xd->mode_info_context->mbmi.txfm_size = tx_size;
 791   switch (tx_size) {
 792     case TX_4X4:
 793       vpx_memcpy(&args.t_above, pd->above_context,
 794                  sizeof(ENTROPY_CONTEXT) * bw);
 795       vpx_memcpy(&args.t_left, pd->left_context,
 796                  sizeof(ENTROPY_CONTEXT) * bh);
 797       get_scan_nb_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, 0),
 798                       &args.scan, &args.nb);
 799       break;
 800     case TX_8X8:
 801       for (i = 0; i < bw; i += 2)
 802         args.t_above[i] = !!*(uint16_t *)&pd->above_context[i];
 803       for (i = 0; i < bh; i += 2)
 804         args.t_left[i] = !!*(uint16_t *)&pd->left_context[i];
 805       get_scan_nb_8x8(get_tx_type_8x8(PLANE_TYPE_Y_WITH_DC, xd),
 806                       &args.scan, &args.nb);
 807       break;
 808     case TX_16X16:
 809       for (i = 0; i < bw; i += 4)
 810         args.t_above[i] = !!*(uint32_t *)&pd->above_context[i];
 811       for (i = 0; i < bh; i += 4)
 812         args.t_left[i] = !!*(uint32_t *)&pd->left_context[i];
 813       get_scan_nb_16x16(get_tx_type_16x16(PLANE_TYPE_Y_WITH_DC, xd),
 814                         &args.scan, &args.nb);
 815       break;
 816     case TX_32X32:
 817       for (i = 0; i < bw; i += 8)
 818         args.t_above[i] = !!*(uint64_t *)&pd->above_context[i];
 819       for (i = 0; i < bh; i += 8)
 820         args.t_left[i] = !!*(uint64_t *)&pd->left_context[i];
 821       args.scan = vp9_default_scan_32x32;
 822       args.nb = vp9_default_scan_32x32_neighbors;
 823       break;
 824     default:
 825       assert(0);
 826   }
 827
 828   foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args);
 829   *distortion = args.dist;
 830   *rate       = args.rate;
 831   *sse        = args.sse;
 832   *skippable  = vp9_sby_is_skippable(xd, bsize) && (!args.skip);
 833 }
 834
 835 static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
 836                                      int *rate, int64_t *distortion,
 837                                      int *skip, int64_t *sse,
 838                                      int64_t ref_best_rd,
 839                                      BLOCK_SIZE_TYPE bs) {
 840   const TX_SIZE max_txfm_size = TX_32X32
 841       - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
 842   VP9_COMMON *const cm = &cpi->common;
 843   MACROBLOCKD *const xd = &x->e_mbd;
 844   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 845   if (max_txfm_size == TX_32X32 &&
 846       (cm->tx_mode == ALLOW_32X32 ||
 847        cm->tx_mode == TX_MODE_SELECT)) {
 848     mbmi->txfm_size = TX_32X32;
 849   } else if (max_txfm_size >= TX_16X16 &&
 850              (cm->tx_mode == ALLOW_16X16 ||
 851               cm->tx_mode == ALLOW_32X32 ||
 852               cm->tx_mode == TX_MODE_SELECT)) {
 853     mbmi->txfm_size = TX_16X16;
 854   } else if (cm->tx_mode != ONLY_4X4) {
 855     mbmi->txfm_size = TX_8X8;
 856   } else {
 857     mbmi->txfm_size = TX_4X4;
 858   }
 859   super_block_yrd_for_txfm(cm, x, rate, distortion, skip,
 860                            &sse[mbmi->txfm_size], ref_best_rd, bs,
 861                            mbmi->txfm_size);
 862   cpi->txfm_stepdown_count[0]++;
 863 }
 864
 865 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 866                                      int (*r)[2], int *rate,
 867                                      int64_t *d, int64_t *distortion,
 868                                      int *s, int *skip,
 869                                      int64_t txfm_cache[TX_MODES],
 870                                      BLOCK_SIZE_TYPE bs) {
 871   const TX_SIZE max_txfm_size = TX_32X32
 872       - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
 873   VP9_COMMON *const cm = &cpi->common;
 874   MACROBLOCKD *const xd = &x->e_mbd;
 875   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 876   vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
 877   int64_t rd[TX_SIZES][2];
 878   int n, m;
 879   int s0, s1;
 880
 881   const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs);
 882
 883   for (n = TX_4X4; n <= max_txfm_size; n++) {
 884     r[n][1] = r[n][0];
 885     if (r[n][0] == INT_MAX)
 886       continue;
 887     for (m = 0; m <= n - (n == max_txfm_size); m++) {
 888       if (m == n)
 889         r[n][1] += vp9_cost_zero(tx_probs[m]);
 890       else
 891         r[n][1] += vp9_cost_one(tx_probs[m]);
 892     }
 893   }
 894
 895   assert(skip_prob > 0);
 896   s0 = vp9_cost_bit(skip_prob, 0);
 897   s1 = vp9_cost_bit(skip_prob, 1);
 898
 899   for (n = TX_4X4; n <= max_txfm_size; n++) {
 900     if (d[n] == INT64_MAX) {
 901       rd[n][0] = rd[n][1] = INT64_MAX;
 902       continue;
 903     }
 904     if (s[n]) {
 905       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
 906     } else {
 907       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
 908       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
 909     }
 910   }
 911
 912   if (max_txfm_size == TX_32X32 &&
 913       (cm->tx_mode == ALLOW_32X32 ||
 914        (cm->tx_mode == TX_MODE_SELECT &&
 915         rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
 916         rd[TX_32X32][1] < rd[TX_4X4][1]))) {
 917     mbmi->txfm_size = TX_32X32;
 918   } else if (max_txfm_size >= TX_16X16 &&
 919              (cm->tx_mode == ALLOW_16X16 ||
 920               cm->tx_mode == ALLOW_32X32 ||
 921               (cm->tx_mode == TX_MODE_SELECT &&
 922                rd[TX_16X16][1] < rd[TX_8X8][1] &&
 923                rd[TX_16X16][1] < rd[TX_4X4][1]))) {
 924     mbmi->txfm_size = TX_16X16;
 925   } else if (cm->tx_mode == ALLOW_8X8 ||
 926              cm->tx_mode == ALLOW_16X16 ||
 927              cm->tx_mode == ALLOW_32X32 ||
 928            (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
 929     mbmi->txfm_size = TX_8X8;
 930   } else {
 931     mbmi->txfm_size = TX_4X4;
 932   }
 933
 934   *distortion = d[mbmi->txfm_size];
 935   *rate       = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT];
 936   *skip       = s[mbmi->txfm_size];
 937
 938   txfm_cache[ONLY_4X4] = rd[TX_4X4][0];
 939   txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];
 940   txfm_cache[ALLOW_16X16] = rd[MIN(max_txfm_size, TX_16X16)][0];
 941   txfm_cache[ALLOW_32X32] = rd[MIN(max_txfm_size, TX_32X32)][0];
 942   if (max_txfm_size == TX_32X32 &&
 943       rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
 944       rd[TX_32X32][1] < rd[TX_4X4][1])
 945     txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
 946   else if (max_txfm_size >= TX_16X16 &&
 947            rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
 948     txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
 949   else
 950     txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
 951                                  rd[TX_4X4][1] : rd[TX_8X8][1];
 952
 953   if (max_txfm_size == TX_32X32 &&
 954       rd[TX_32X32][1] < rd[TX_16X16][1] &&
 955       rd[TX_32X32][1] < rd[TX_8X8][1] &&
 956       rd[TX_32X32][1] < rd[TX_4X4][1]) {
 957     cpi->txfm_stepdown_count[0]++;
 958   } else if (max_txfm_size >= TX_16X16 &&
 959              rd[TX_16X16][1] < rd[TX_8X8][1] &&
 960              rd[TX_16X16][1] < rd[TX_4X4][1]) {
 961     cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
 962   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
 963     cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
 964   } else {
 965     cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
 966   }
 967 }
 968
 969 static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
 970                                           int (*r)[2], int *rate,
 971                                           int64_t *d, int64_t *distortion,
 972                                           int *s, int *skip, int64_t *sse,
 973                                           int64_t ref_best_rd,
 974                                           BLOCK_SIZE_TYPE bs,
 975                                           int *model_used) {
 976   const TX_SIZE max_txfm_size = TX_32X32
 977       - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
 978   VP9_COMMON *const cm = &cpi->common;
 979   MACROBLOCKD *const xd = &x->e_mbd;
 980   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 981   vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
 982   int64_t rd[TX_SIZES][2];
 983   int n, m;
 984   int s0, s1;
 985   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
 986   // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
 987
 988   const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs);
 989
 990   // for (n = TX_4X4; n <= max_txfm_size; n++)
 991   //   r[n][0] = (r[n][0] * scale_r[n]);
 992
 993   for (n = TX_4X4; n <= max_txfm_size; n++) {
 994     r[n][1] = r[n][0];
 995     for (m = 0; m <= n - (n == max_txfm_size); m++) {
 996       if (m == n)
 997         r[n][1] += vp9_cost_zero(tx_probs[m]);
 998       else
 999         r[n][1] += vp9_cost_one(tx_probs[m]);
1000     }
1001   }
1002
1003   assert(skip_prob > 0);
1004   s0 = vp9_cost_bit(skip_prob, 0);
1005   s1 = vp9_cost_bit(skip_prob, 1);
1006
1007   for (n = TX_4X4; n <= max_txfm_size; n++) {
1008     if (s[n]) {
1009       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
1010     } else {
1011       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
1012       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
1013     }
1014   }
1015   for (n = TX_4X4; n <= max_txfm_size; n++) {
1016     rd[n][0] = (scale_rd[n] * rd[n][0]);
1017     rd[n][1] = (scale_rd[n] * rd[n][1]);
1018   }
1019
1020   if (max_txfm_size == TX_32X32 &&
1021       (cm->tx_mode == ALLOW_32X32 ||
1022        (cm->tx_mode == TX_MODE_SELECT &&
1023         rd[TX_32X32][1] <= rd[TX_16X16][1] &&
1024         rd[TX_32X32][1] <= rd[TX_8X8][1] &&
1025         rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
1026     mbmi->txfm_size = TX_32X32;
1027   } else if (max_txfm_size >= TX_16X16 &&
1028              (cm->tx_mode == ALLOW_16X16 ||
1029               cm->tx_mode == ALLOW_32X32 ||
1030               (cm->tx_mode == TX_MODE_SELECT &&
1031                rd[TX_16X16][1] <= rd[TX_8X8][1] &&
1032                rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
1033     mbmi->txfm_size = TX_16X16;
1034   } else if (cm->tx_mode == ALLOW_8X8 ||
1035              cm->tx_mode == ALLOW_16X16 ||
1036              cm->tx_mode == ALLOW_32X32 ||
1037            (cm->tx_mode == TX_MODE_SELECT &&
1038             rd[TX_8X8][1] <= rd[TX_4X4][1])) {
1039     mbmi->txfm_size = TX_8X8;
1040   } else {
1041     mbmi->txfm_size = TX_4X4;
1042   }
1043
1044   if (model_used[mbmi->txfm_size]) {
1045     // Actually encode using the chosen mode if a model was used, but do not
1046     // update the r, d costs
1047     super_block_yrd_for_txfm(cm, x, rate, distortion, skip,
1048                              &sse[mbmi->txfm_size], ref_best_rd,
1049                              bs, mbmi->txfm_size);
1050   } else {
1051     *distortion = d[mbmi->txfm_size];
1052     *rate       = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT];
1053     *skip       = s[mbmi->txfm_size];
1054   }
1055
1056   if (max_txfm_size == TX_32X32 &&
1057       rd[TX_32X32][1] <= rd[TX_16X16][1] &&
1058       rd[TX_32X32][1] <= rd[TX_8X8][1] &&
1059       rd[TX_32X32][1] <= rd[TX_4X4][1]) {
1060     cpi->txfm_stepdown_count[0]++;
1061   } else if (max_txfm_size >= TX_16X16 &&
1062              rd[TX_16X16][1] <= rd[TX_8X8][1] &&
1063              rd[TX_16X16][1] <= rd[TX_4X4][1]) {
1064     cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
1065   } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
1066     cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
1067   } else {
1068     cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
1069   }
1070 }
1071
1072 static void super_block_yrd(VP9_COMP *cpi,
1073                             MACROBLOCK *x, int *rate, int64_t *distortion,
1074                             int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
1075                             int64_t txfm_cache[TX_MODES],
1076                             int64_t ref_best_rd) {
1077   VP9_COMMON *const cm = &cpi->common;
1078   int r[TX_SIZES][2], s[TX_SIZES];
1079   int64_t d[TX_SIZES], sse[TX_SIZES];
1080   MACROBLOCKD *xd = &x->e_mbd;
1081   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
1082
1083   assert(bs == mbmi->sb_type);
1084   if (mbmi->ref_frame[0] > INTRA_FRAME)
1085     vp9_subtract_sby(x, bs);
1086
1087   if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
1088       (cpi->sf.tx_size_search_method != USE_FULL_RD &&
1089        mbmi->ref_frame[0] == INTRA_FRAME)) {
1090     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
1091     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
1092                              ref_best_rd, bs);
1093     if (psse)
1094       *psse = sse[mbmi->txfm_size];
1095     return;
1096   }
1097
1098   if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
1099       mbmi->ref_frame[0] > INTRA_FRAME) {
1100     int model_used[TX_SIZES] = {1, 1, 1, 1};
1101     if (bs >= BLOCK_SIZE_SB32X32) {
1102       if (model_used[TX_32X32]) {
1103         model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
1104                              &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
1105       } else {
1106         super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],
1107                                  &s[TX_32X32], &sse[TX_32X32], INT64_MAX,
1108                                  bs, TX_32X32);
1109       }
1110     }
1111     if (bs >= BLOCK_SIZE_MB16X16) {
1112       if (model_used[TX_16X16]) {
1113         model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
1114                              &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
1115       } else {
1116         super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],
1117                                  &s[TX_16X16], &sse[TX_16X16], INT64_MAX,
1118                                  bs, TX_16X16);
1119       }
1120     }
1121     if (model_used[TX_8X8]) {
1122       model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
1123                            &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
1124     } else {
1125       super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
1126                                &sse[TX_8X8], INT64_MAX, bs, TX_8X8);
1127     }
1128     if (model_used[TX_4X4]) {
1129       model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
1130                            &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
1131     } else {
1132       super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
1133                                &sse[TX_4X4], INT64_MAX, bs, TX_4X4);
1134     }
1135     choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
1136                                   skip, sse, ref_best_rd, bs, model_used);
1137   } else {
1138     if (bs >= BLOCK_SIZE_SB32X32)
1139       super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],
1140                                &s[TX_32X32], &sse[TX_32X32], ref_best_rd,
1141                                bs, TX_32X32);
1142     if (bs >= BLOCK_SIZE_MB16X16)
1143       super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],
1144                                &s[TX_16X16], &sse[TX_16X16], ref_best_rd,
1145                                bs, TX_16X16);
1146     super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
1147                              &sse[TX_8X8], ref_best_rd, bs, TX_8X8);
1148     super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
1149                              &sse[TX_4X4], ref_best_rd, bs, TX_4X4);
1150     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
1151                              skip, txfm_cache, bs);
1152   }
1153   if (psse)
1154     *psse = sse[mbmi->txfm_size];
1155 }
1156
1157 static int conditional_skipintra(MB_PREDICTION_MODE mode,
1158                                  MB_PREDICTION_MODE best_intra_mode) {
1159   if (mode == D117_PRED &&
1160       best_intra_mode != V_PRED &&
1161       best_intra_mode != D135_PRED)
1162     return 1;
1163   if (mode == D63_PRED &&
1164       best_intra_mode != V_PRED &&
1165       best_intra_mode != D45_PRED)
1166     return 1;
1167   if (mode == D27_PRED &&
1168       best_intra_mode != H_PRED &&
1169       best_intra_mode != D45_PRED)
1170     return 1;
1171   if (mode == D153_PRED &&
1172       best_intra_mode != H_PRED &&
1173       best_intra_mode != D135_PRED)
1174     return 1;
1175   return 0;
1176 }
1177
1178 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
1179                                      MB_PREDICTION_MODE *best_mode,
1180                                      int *bmode_costs,
1181                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
1182                                      int *bestrate, int *bestratey,
1183                                      int64_t *bestdistortion,
1184                                      BLOCK_SIZE_TYPE bsize,
1185                                      int64_t rd_thresh) {
1186   MB_PREDICTION_MODE mode;
1187   MACROBLOCKD *xd = &x->e_mbd;
1188   int64_t best_rd = rd_thresh;
1189   int rate = 0;
1190   int64_t distortion;
1191   VP9_COMMON *const cm = &cpi->common;
1192   struct macroblock_plane *p = &x->plane[0];
1193   struct macroblockd_plane *pd = &xd->plane[0];
1194   const int src_stride = p->src.stride;
1195   const int dst_stride = pd->dst.stride;
1196   uint8_t *src_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib,
1197                                                 p->src.buf, src_stride);
1198   uint8_t *dst_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib,
1199                                                 pd->dst.buf, dst_stride);
1200   int16_t *src_diff, *coeff;
1201
1202   ENTROPY_CONTEXT ta[2], tempa[2];
1203   ENTROPY_CONTEXT tl[2], templ[2];
1204   TX_TYPE tx_type = DCT_DCT;
1205   int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1206   int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1207   int idx, idy, block;
1208   uint8_t best_dst[8 * 8];
1209
1210   assert(ib < 4);
1211
1212   vpx_memcpy(ta, a, sizeof(ta));
1213   vpx_memcpy(tl, l, sizeof(tl));
1214   xd->mode_info_context->mbmi.txfm_size = TX_4X4;
1215
1216   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
1217     int64_t this_rd;
1218     int ratey = 0;
1219     // Only do the oblique modes if the best so far is
1220     // one of the neighboring directional modes
1221     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
1222       if (conditional_skipintra(mode, *best_mode))
1223           continue;
1224     }
1225
1226     rate = bmode_costs[mode];
1227     distortion = 0;
1228
1229     vpx_memcpy(tempa, ta, sizeof(ta));
1230     vpx_memcpy(templ, tl, sizeof(tl));
1231
1232     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
1233       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
1234         int64_t ssz;
1235         const int16_t *scan;
1236         uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
1237         uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
1238
1239         block = ib + idy * 2 + idx;
1240         xd->mode_info_context->bmi[block].as_mode = mode;
1241         src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block,
1242                                              p->src_diff);
1243         coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
1244         vp9_predict_intra_block(xd, block, 1,
1245                                 TX_4X4, mode,
1246                                 x->skip_encode ? src : dst,
1247                                 x->skip_encode ? src_stride : dst_stride,
1248                                 dst, dst_stride);
1249         vp9_subtract_block(4, 4, src_diff, 8,
1250                            src, src_stride,
1251                            dst, dst_stride);
1252
1253         tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
1254         if (tx_type != DCT_DCT) {
1255           vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
1256           x->quantize_b_4x4(x, block, tx_type, 16);
1257         } else {
1258           x->fwd_txm4x4(src_diff, coeff, 16);
1259           x->quantize_b_4x4(x, block, tx_type, 16);
1260         }
1261
1262         scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block));
1263         ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
1264                              tempa + idx, templ + idy, TX_4X4, scan,
1265                              vp9_get_coef_neighbors_handle(scan));
1266         distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
1267                                                           block, 16),
1268                                       16, &ssz) >> 2;
1269         if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
1270           goto next;
1271
1272         if (tx_type != DCT_DCT)
1273           vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
1274                                dst, pd->dst.stride, tx_type);
1275         else
1276           xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
1277                              dst, pd->dst.stride);
1278       }
1279     }
1280
1281     rate += ratey;
1282     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
1283
1284     if (this_rd < best_rd) {
1285       *bestrate = rate;
1286       *bestratey = ratey;
1287       *bestdistortion = distortion;
1288       best_rd = this_rd;
1289       *best_mode = mode;
1290       vpx_memcpy(a, tempa, sizeof(tempa));
1291       vpx_memcpy(l, templ, sizeof(templ));
1292       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
1293         vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
1294                    num_4x4_blocks_wide * 4);
1295     }
1296   next:
1297     {}
1298   }
1299
1300   if (best_rd >= rd_thresh || x->skip_encode)
1301     return best_rd;
1302
1303   for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
1304     vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
1305                num_4x4_blocks_wide * 4);
1306
1307   return best_rd;
1308 }
1309
1310 static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
1311                                          int *Rate, int *rate_y,
1312                                          int64_t *Distortion, int64_t best_rd) {
1313   int i, j;
1314   MACROBLOCKD *const xd = &mb->e_mbd;
1315   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
1316   int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1317   int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1318   int idx, idy;
1319   int cost = 0;
1320   int64_t distortion = 0;
1321   int tot_rate_y = 0;
1322   int64_t total_rd = 0;
1323   ENTROPY_CONTEXT t_above[4], t_left[4];
1324   int *bmode_costs;
1325   MODE_INFO *const mic = xd->mode_info_context;
1326
1327   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
1328   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
1329
1330   bmode_costs = mb->mbmode_cost;
1331
1332   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
1333   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
1334     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
1335       const int mis = xd->mode_info_stride;
1336       MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
1337       int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
1338       int64_t UNINITIALIZED_IS_SAFE(d), this_rd;
1339       i = idy * 2 + idx;
1340
1341       if (cpi->common.frame_type == KEY_FRAME) {
1342         const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis);
1343         const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
1344                                      left_block_mode(mic, i) : DC_PRED;
1345
1346         bmode_costs  = mb->y_mode_costs[A][L];
1347       }
1348
1349       this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
1350                                       t_above + idx, t_left + idy,
1351                                       &r, &ry, &d, bsize,
1352                                       best_rd - total_rd);
1353       if (this_rd >= best_rd - total_rd)
1354         return INT64_MAX;
1355
1356       total_rd += this_rd;
1357       cost += r;
1358       distortion += d;
1359       tot_rate_y += ry;
1360
1361       mic->bmi[i].as_mode = best_mode;
1362       for (j = 1; j < num_4x4_blocks_high; ++j)
1363         mic->bmi[i + j * 2].as_mode = best_mode;
1364       for (j = 1; j < num_4x4_blocks_wide; ++j)
1365         mic->bmi[i + j].as_mode = best_mode;
1366
1367       if (total_rd >= best_rd)
1368         return INT64_MAX;
1369     }
1370   }
1371
1372   *Rate = cost;
1373   *rate_y = tot_rate_y;
1374   *Distortion = distortion;
1375   xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode;
1376
1377   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
1378 }
1379
1380 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
1381                                       int *rate, int *rate_tokenonly,
1382                                       int64_t *distortion, int *skippable,
1383                                       BLOCK_SIZE_TYPE bsize,
1384                                       int64_t txfm_cache[TX_MODES],
1385                                       int64_t best_rd) {
1386   MB_PREDICTION_MODE mode;
1387   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
1388   MACROBLOCKD *const xd = &x->e_mbd;
1389   int this_rate, this_rate_tokenonly, s;
1390   int64_t this_distortion, this_rd;
1391   TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
1392   int i;
1393   int *bmode_costs = x->mbmode_cost;
1394
1395   if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
1396     for (i = 0; i < TX_MODES; i++)
1397       txfm_cache[i] = INT64_MAX;
1398   }
1399
1400   /* Y Search for intra prediction mode */
1401   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
1402     int64_t local_txfm_cache[TX_MODES];
1403     MODE_INFO *const mic = xd->mode_info_context;
1404     const int mis = xd->mode_info_stride;
1405
1406     if (cpi->common.frame_type == KEY_FRAME) {
1407       const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
1408       const MB_PREDICTION_MODE L = xd->left_available ?
1409                                    left_block_mode(mic, 0) : DC_PRED;
1410
1411       bmode_costs = x->y_mode_costs[A][L];
1412     }
1413     x->e_mbd.mode_info_context->mbmi.mode = mode;
1414
1415     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
1416                     bsize, local_txfm_cache, best_rd);
1417
1418     if (this_rate_tokenonly == INT_MAX)
1419       continue;
1420
1421     this_rate = this_rate_tokenonly + bmode_costs[mode];
1422     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
1423
1424     if (this_rd < best_rd) {
1425       mode_selected   = mode;
1426       best_rd         = this_rd;
1427       best_tx         = x->e_mbd.mode_info_context->mbmi.txfm_size;
1428       *rate           = this_rate;
1429       *rate_tokenonly = this_rate_tokenonly;
1430       *distortion     = this_distortion;
1431       *skippable      = s;
1432     }
1433
1434     if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
1435       for (i = 0; i < TX_MODES; i++) {
1436         int64_t adj_rd = this_rd + local_txfm_cache[i] -
1437             local_txfm_cache[cpi->common.tx_mode];
1438         if (adj_rd < txfm_cache[i]) {
1439           txfm_cache[i] = adj_rd;
1440         }
1441       }
1442     }
1443   }
1444
1445   x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
1446   x->e_mbd.mode_info_context->mbmi.txfm_size = best_tx;
1447
1448   return best_rd;
1449 }
1450
1451 static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
1452                                       int *rate, int64_t *distortion,
1453                                       int *skippable, int64_t *sse,
1454                                       BLOCK_SIZE_TYPE bsize,
1455                                       TX_SIZE uv_tx_size) {
1456   MACROBLOCKD *const xd = &x->e_mbd;
1457   int64_t dummy;
1458   if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
1459     vp9_encode_intra_block_uv(cm, x, bsize);
1460   else
1461     vp9_xform_quant_sbuv(cm, x, bsize);
1462
1463   *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2,
1464                                  sse ? sse : &dummy);
1465   *rate       = rdcost_uv(cm, x, bsize, uv_tx_size);
1466   *skippable  = vp9_sbuv_is_skippable(xd, bsize);
1467 }
1468
1469 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
1470                              int *rate, int64_t *distortion, int *skippable,
1471                              int64_t *sse, BLOCK_SIZE_TYPE bsize) {
1472   MACROBLOCKD *const xd = &x->e_mbd;
1473   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
1474   TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
1475
1476   if (mbmi->ref_frame[0] > INTRA_FRAME)
1477     vp9_subtract_sbuv(x, bsize);
1478
1479   super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize,
1480                             uv_txfm_size);
1481 }
1482
1483 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
1484                                        int *rate, int *rate_tokenonly,
1485                                        int64_t *distortion, int *skippable,
1486                                        BLOCK_SIZE_TYPE bsize) {
1487   MB_PREDICTION_MODE mode;
1488   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
1489   int64_t best_rd = INT64_MAX, this_rd;
1490   int this_rate_tokenonly, this_rate, s;
1491   int64_t this_distortion;
1492
1493   MB_PREDICTION_MODE last_mode = bsize <= BLOCK_SIZE_SB8X8 ?
1494               TM_PRED : cpi->sf.last_chroma_intra_mode;
1495
1496   for (mode = DC_PRED; mode <= last_mode; mode++) {
1497     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
1498     super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
1499                      &this_distortion, &s, NULL, bsize);
1500     this_rate = this_rate_tokenonly +
1501                 x->intra_uv_mode_cost[cpi->common.frame_type][mode];
1502     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
1503
1504     if (this_rd < best_rd) {
1505       mode_selected   = mode;
1506       best_rd         = this_rd;
1507       *rate           = this_rate;
1508       *rate_tokenonly = this_rate_tokenonly;
1509       *distortion     = this_distortion;
1510       *skippable      = s;
1511     }
1512   }
1513
1514   x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
1515
1516   return best_rd;
1517 }
1518
1519 static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
1520                               int *rate, int *rate_tokenonly,
1521                               int64_t *distortion, int *skippable,
1522                               BLOCK_SIZE_TYPE bsize) {
1523   int64_t this_rd;
1524
1525   x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
1526   super_block_uvrd(&cpi->common, x, rate_tokenonly,
1527                    distortion, skippable, NULL, bsize);
1528   *rate = *rate_tokenonly +
1529           x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
1530   this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
1531
1532   return this_rd;
1533 }
1534
1535 static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
1536                                  int *rate_uv, int *rate_uv_tokenonly,
1537                                  int64_t *dist_uv, int *skip_uv,
1538                                  MB_PREDICTION_MODE *mode_uv) {
1539   MACROBLOCK *const x = &cpi->mb;
1540
1541   // Use an estimated rd for uv_intra based on DC_PRED if the
1542   // appropriate speed flag is set.
1543   if (cpi->sf.use_uv_intra_rd_estimate) {
1544     rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
1545                    (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
1546                    bsize);
1547   // Else do a proper rd search for each possible transform size that may
1548   // be considered in the main rd loop.
1549   } else {
1550     rd_pick_intra_sbuv_mode(cpi, x,
1551                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
1552                             (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8
1553                             : bsize);
1554   }
1555   *mode_uv = x->e_mbd.mode_info_context->mbmi.uv_mode;
1556 }
1557
1558 static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
1559                        int mode_context) {
1560   MACROBLOCK *const x = &cpi->mb;
1561   MACROBLOCKD *const xd = &x->e_mbd;
1562   const int segment_id = xd->mode_info_context->mbmi.segment_id;
1563
1564   // Don't account for mode here if segment skip is enabled.
1565   if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) {
1566     assert(is_inter_mode(mode));
1567     return x->inter_mode_cost[mode_context][mode - NEARESTMV];
1568   } else {
1569     return 0;
1570   }
1571 }
1572
1573 void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
1574   x->e_mbd.mode_info_context->mbmi.mode = mb;
1575   x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
1576 }
1577
// Forward declarations of the motion search helpers so that the sub-8x8
// search code below can call them.

// Compound-prediction motion search.  rd_check_segment_txsize() calls it for
// NEWMV labels with two reference frames, passing the per-reference
// single-prediction results in single_newmv and reading the refined vectors
// back from frame_mv.
static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                BLOCK_SIZE_TYPE bsize,
                                int_mv *frame_mv,
                                int mi_row, int mi_col,
                                int_mv single_newmv[MAX_REF_FRAMES],
                                int *rate_mv);
// Single-reference motion search; result in *tmp_mv, its rate in *rate_mv.
static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                 BLOCK_SIZE_TYPE bsize,
                                 int mi_row, int mi_col,
                                 int_mv *tmp_mv, int *rate_mv);
1588
1589 static int labels2mode(MACROBLOCK *x, int i,
1590                        MB_PREDICTION_MODE this_mode,
1591                        int_mv *this_mv, int_mv *this_second_mv,
1592                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
1593                        int_mv seg_mvs[MAX_REF_FRAMES],
1594                        int_mv *best_ref_mv,
1595                        int_mv *second_best_ref_mv,
1596                        int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
1597   MACROBLOCKD *const xd = &x->e_mbd;
1598   MODE_INFO *const mic = xd->mode_info_context;
1599   MB_MODE_INFO * mbmi = &mic->mbmi;
1600   int cost = 0, thismvcost = 0;
1601   int idx, idy;
1602   int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
1603   int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
1604
1605   /* We have to be careful retrieving previously-encoded motion vectors.
1606    Ones from this macroblock have to be pulled from the BLOCKD array
1607    as they have not yet made it to the bmi array in our MB_MODE_INFO. */
1608   MB_PREDICTION_MODE m;
1609
1610   // the only time we should do costing for new motion vector or mode
1611   // is when we are on a new label  (jbb May 08, 2007)
1612   switch (m = this_mode) {
1613     case NEWMV:
1614       this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
1615       thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
1616                                     102, xd->allow_high_precision_mv);
1617       if (mbmi->ref_frame[1] > 0) {
1618         this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
1619         thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
1620                                       mvjcost, mvcost, 102,
1621                                       xd->allow_high_precision_mv);
1622       }
1623       break;
1624     case NEARESTMV:
1625       this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
1626       if (mbmi->ref_frame[1] > 0)
1627         this_second_mv->as_int =
1628             frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
1629       break;
1630     case NEARMV:
1631       this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
1632       if (mbmi->ref_frame[1] > 0)
1633         this_second_mv->as_int =
1634             frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
1635       break;
1636     case ZEROMV:
1637       this_mv->as_int = 0;
1638       if (mbmi->ref_frame[1] > 0)
1639         this_second_mv->as_int = 0;
1640       break;
1641     default:
1642       break;
1643   }
1644
1645   cost = cost_mv_ref(cpi, this_mode,
1646                      mbmi->mb_mode_context[mbmi->ref_frame[0]]);
1647
1648   mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
1649   if (mbmi->ref_frame[1] > 0)
1650     mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
1651
1652   x->partition_info->bmi[i].mode = m;
1653   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
1654     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
1655       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
1656                  &mic->bmi[i], sizeof(mic->bmi[i]));
1657
1658   cost += thismvcost;
1659   return cost;
1660 }
1661
1662 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
1663                                        MACROBLOCK *x,
1664                                        int64_t best_yrd,
1665                                        int i,
1666                                        int *labelyrate,
1667                                        int64_t *distortion, int64_t *sse,
1668                                        ENTROPY_CONTEXT *ta,
1669                                        ENTROPY_CONTEXT *tl) {
1670   int k;
1671   VP9_COMMON *const cm = &cpi->common;
1672   MACROBLOCKD *xd = &x->e_mbd;
1673   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
1674   const int width = plane_block_width(bsize, &xd->plane[0]);
1675   const int height = plane_block_height(bsize, &xd->plane[0]);
1676   int idx, idy;
1677   const int src_stride = x->plane[0].src.stride;
1678   uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
1679                                                  x->plane[0].src.buf,
1680                                                  src_stride);
1681   int16_t* src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
1682                                                 x->plane[0].src_diff);
1683   int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
1684   uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
1685                                                  xd->plane[0].pre[0].buf,
1686                                                  xd->plane[0].pre[0].stride);
1687   uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
1688                                                  xd->plane[0].dst.buf,
1689                                                  xd->plane[0].dst.stride);
1690   int64_t thisdistortion = 0, thissse = 0;
1691   int thisrate = 0;
1692
1693   vp9_build_inter_predictor(pre,
1694                             xd->plane[0].pre[0].stride,
1695                             dst,
1696                             xd->plane[0].dst.stride,
1697                             &xd->mode_info_context->bmi[i].as_mv[0],
1698                             &xd->scale_factor[0],
1699                             width, height, 0, &xd->subpix,
1700                             MV_PRECISION_Q3);
1701
1702   if (xd->mode_info_context->mbmi.ref_frame[1] > 0) {
1703     uint8_t* const second_pre =
1704     raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
1705                               xd->plane[0].pre[1].buf,
1706                               xd->plane[0].pre[1].stride);
1707     vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
1708                               dst, xd->plane[0].dst.stride,
1709                               &xd->mode_info_context->bmi[i].as_mv[1],
1710                               &xd->scale_factor[1],
1711                               width, height, 1,
1712                               &xd->subpix, MV_PRECISION_Q3);
1713   }
1714
1715   vp9_subtract_block(height, width, src_diff, 8,
1716                      src, src_stride,
1717                      dst, xd->plane[0].dst.stride);
1718
1719   k = i;
1720   for (idy = 0; idy < height / 4; ++idy) {
1721     for (idx = 0; idx < width / 4; ++idx) {
1722       int64_t ssz, rd, rd1, rd2;
1723
1724       k += (idy * 2 + idx);
1725       src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
1726                                            x->plane[0].src_diff);
1727       coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
1728       x->fwd_txm4x4(src_diff, coeff, 16);
1729       x->quantize_b_4x4(x, k, DCT_DCT, 16);
1730       thisdistortion += vp9_block_error(coeff,
1731                                         BLOCK_OFFSET(xd->plane[0].dqcoeff,
1732                                                      k, 16), 16, &ssz);
1733       thissse += ssz;
1734       thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
1735                               ta + (k & 1),
1736                               tl + (k >> 1), TX_4X4,
1737                               vp9_default_scan_4x4,
1738                               vp9_default_scan_4x4_neighbors);
1739       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
1740       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
1741       rd = MIN(rd1, rd2);
1742       if (rd >= best_yrd)
1743         return INT64_MAX;
1744     }
1745   }
1746   *distortion = thisdistortion >> 2;
1747   *labelyrate = thisrate;
1748   *sse = thissse >> 2;
1749
1750   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
1751 }
1752
// Rate-distortion statistics for one (label, inter mode) pair of a sub-8x8
// block, as filled in by rd_check_segment_txsize().
typedef struct {
  int eobs;          // x->e_mbd.plane[0].eobs[label] after encoding
  int brate;         // mode + mv cost; coefficient rate is added after a
                     // successful encode
  int byrate;        // coefficient rate from encode_inter_mb_segment()
  int64_t bdist;     // distortion from encode_inter_mb_segment()
  int64_t bsse;      // sse from encode_inter_mb_segment()
  int64_t brdcost;   // total rd cost; INT64_MAX when the mode was skipped
                     // or rejected
  int_mv mvs[2];     // motion vectors for the first / second reference
  ENTROPY_CONTEXT ta[2];  // above entropy contexts used for this mode
  ENTROPY_CONTEXT tl[2];  // left entropy contexts used for this mode
} SEG_RDSTAT;
1764
// Working state and results of the sub-8x8 mode search for one
// interpolation-filter index (see rd_pick_best_mbsegmentation()).
typedef struct {
  // Reference motion vectors the NEWMV cost and search are measured against.
  int_mv *ref_mv, *second_ref_mv;
  // Start point for the NEWMV full-pixel search; initialised from *ref_mv
  // and may be replaced by a previously processed label's vector.
  int_mv mvp;

  // In: rd budget for the whole block.  Out: summed rd cost of the selected
  // modes, or INT64_MAX when the search was abandoned.
  int64_t segment_rd;
  int r;              // summed rate of the selected modes
  int64_t d;          // summed distortion of the selected modes
  int64_t sse;        // summed sse of the selected modes
  int segment_yrate;  // summed coefficient rate of the selected modes
  MB_PREDICTION_MODE modes[4];  // selected mode per 4x4 unit
  SEG_RDSTAT rdstat[4][VP9_INTER_MODES];  // stats per 4x4 unit and mode
  int mvthresh;       // divided by the label count to form the threshold
                      // below which no new motion search is done
} BEST_SEG_INFO;
1778
1779 static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
1780   int r = 0;
1781   r |= (mv->as_mv.row >> 3) < x->mv_row_min;
1782   r |= (mv->as_mv.row >> 3) > x->mv_row_max;
1783   r |= (mv->as_mv.col >> 3) < x->mv_col_min;
1784   r |= (mv->as_mv.col >> 3) > x->mv_col_max;
1785   return r;
1786 }
1787
1788 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
1789   MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
1790   x->plane[0].src.buf =
1791       raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
1792                                 x->plane[0].src.buf,
1793                                 x->plane[0].src.stride);
1794   assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
1795   x->e_mbd.plane[0].pre[0].buf =
1796       raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
1797                                 x->e_mbd.plane[0].pre[0].buf,
1798                                 x->e_mbd.plane[0].pre[0].stride);
1799   if (mbmi->ref_frame[1])
1800     x->e_mbd.plane[0].pre[1].buf =
1801         raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
1802                                   x->e_mbd.plane[0].pre[1].buf,
1803                                   x->e_mbd.plane[0].pre[1].stride);
1804 }
1805
1806 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
1807                                   struct buf_2d orig_pre[2]) {
1808   MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
1809   x->plane[0].src = orig_src;
1810   x->e_mbd.plane[0].pre[0] = orig_pre[0];
1811   if (mbmi->ref_frame[1])
1812     x->e_mbd.plane[0].pre[1] = orig_pre[1];
1813 }
1814
1815 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
1816                                     BEST_SEG_INFO *bsi_buf, int filter_idx,
1817                                     int_mv seg_mvs[4][MAX_REF_FRAMES],
1818                                     int mi_row, int mi_col) {
1819   int i, j, br = 0, idx, idy;
1820   int64_t bd = 0, block_sse = 0;
1821   MB_PREDICTION_MODE this_mode;
1822   MODE_INFO *mi = x->e_mbd.mode_info_context;
1823   MB_MODE_INFO *const mbmi = &mi->mbmi;
1824   const int label_count = 4;
1825   int64_t this_segment_rd = 0;
1826   int label_mv_thresh;
1827   int segmentyrate = 0;
1828   BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
1829   int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1830   int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1831   vp9_variance_fn_ptr_t *v_fn_ptr;
1832   ENTROPY_CONTEXT t_above[2], t_left[2];
1833   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
1834   int mode_idx;
1835   int subpelmv = 1, have_ref = 0;
1836
1837   vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
1838   vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
1839
1840   v_fn_ptr = &cpi->fn_ptr[bsize];
1841
1842   // 64 makes this threshold really big effectively
1843   // making it so that we very rarely check mvs on
1844   // segments.   setting this to 1 would make mv thresh
1845   // roughly equal to what it is for macroblocks
1846   label_mv_thresh = 1 * bsi->mvthresh / label_count;
1847
1848   // Segmentation method overheads
1849   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
1850     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
1851       // TODO(jingning,rbultje): rewrite the rate-distortion optimization
1852       // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
1853       int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT];
1854       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
1855       MB_PREDICTION_MODE mode_selected = ZEROMV;
1856       int64_t best_rd = INT64_MAX;
1857       i = idy * 2 + idx;
1858
1859       frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
1860       frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
1861       vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
1862                                     &frame_mv[NEARESTMV][mbmi->ref_frame[0]],
1863                                     &frame_mv[NEARMV][mbmi->ref_frame[0]],
1864                                     i, 0);
1865       if (mbmi->ref_frame[1] > 0)
1866         vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
1867                                    &frame_mv[NEARESTMV][mbmi->ref_frame[1]],
1868                                    &frame_mv[NEARMV][mbmi->ref_frame[1]],
1869                                    i, 1);
1870
1871       // search for the best motion vector on this segment
1872       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
1873         const struct buf_2d orig_src = x->plane[0].src;
1874         struct buf_2d orig_pre[2];
1875
1876         mode_idx = inter_mode_offset(this_mode);
1877         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
1878
1879         // if we're near/nearest and mv == 0,0, compare to zeromv
1880         if ((this_mode == NEARMV || this_mode == NEARESTMV ||
1881              this_mode == ZEROMV) &&
1882             frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 &&
1883             (mbmi->ref_frame[1] <= 0 ||
1884              frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) {
1885           int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]];
1886           int c1 = cost_mv_ref(cpi, NEARMV, rfc);
1887           int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
1888           int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
1889
1890           if (this_mode == NEARMV) {
1891             if (c1 > c3)
1892               continue;
1893           } else if (this_mode == NEARESTMV) {
1894             if (c2 > c3)
1895               continue;
1896           } else {
1897             assert(this_mode == ZEROMV);
1898             if (mbmi->ref_frame[1] <= 0) {
1899               if ((c3 >= c2 &&
1900                    frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
1901                   (c3 >= c1 &&
1902                    frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
1903                 continue;
1904             } else {
1905               if ((c3 >= c2 &&
1906                    frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
1907                    frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
1908                   (c3 >= c1 &&
1909                    frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
1910                    frame_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
1911                 continue;
1912             }
1913           }
1914         }
1915
1916         vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
1917         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
1918                    sizeof(bsi->rdstat[i][mode_idx].ta));
1919         vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
1920                    sizeof(bsi->rdstat[i][mode_idx].tl));
1921
1922         // motion search for newmv (single predictor case only)
1923         if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV &&
1924             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
1925           int step_param = 0;
1926           int further_steps;
1927           int thissme, bestsme = INT_MAX;
1928           int sadpb = x->sadperbit4;
1929           int_mv mvp_full;
1930
1931           /* Is the best so far sufficiently good that we cant justify doing
1932            * and new motion search. */
1933           if (best_rd < label_mv_thresh)
1934             break;
1935
1936           if (cpi->compressor_speed) {
1937             // use previous block's result as next block's MV predictor.
1938             if (i > 0) {
1939               bsi->mvp.as_int =
1940               x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
1941               if (i == 2)
1942                 bsi->mvp.as_int =
1943                 x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
1944             }
1945           }
1946           if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
1947             // Take wtd average of the step_params based on the last frame's
1948             // max mv magnitude and the best ref mvs of the current block for
1949             // the given reference.
1950             if (i == 0)
1951               step_param = (vp9_init_search_range(
1952                   cpi, x->max_mv_context[mbmi->ref_frame[0]]) +
1953                   cpi->mv_step_param) >> 1;
1954             else
1955               step_param = (vp9_init_search_range(
1956                   cpi, MAX(abs(bsi->mvp.as_mv.row),
1957                            abs(bsi->mvp.as_mv.col)) >> 3) +
1958                   cpi->mv_step_param) >> 1;
1959           } else {
1960             step_param = cpi->mv_step_param;
1961           }
1962
1963           further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
1964
1965           mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
1966           mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
1967
1968           // adjust src pointer for this block
1969           mi_buf_shift(x, i);
1970           bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
1971                                            sadpb, further_steps, 0, v_fn_ptr,
1972                                            bsi->ref_mv, &mode_mv[NEWMV]);
1973
1974           // Should we do a full search (best quality only)
1975           if (cpi->compressor_speed == 0) {
1976             /* Check if mvp_full is within the range. */
1977             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
1978                      x->mv_row_min, x->mv_row_max);
1979
1980             thissme = cpi->full_search_sad(x, &mvp_full,
1981                                            sadpb, 16, v_fn_ptr,
1982                                            x->nmvjointcost, x->mvcost,
1983                                            bsi->ref_mv, i);
1984
1985             if (thissme < bestsme) {
1986               bestsme = thissme;
1987               mode_mv[NEWMV].as_int =
1988                   x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int;
1989             } else {
1990               /* The full search result is actually worse so re-instate the
1991                * previous best vector */
1992               x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int =
1993                   mode_mv[NEWMV].as_int;
1994             }
1995           }
1996
1997           if (bestsme < INT_MAX) {
1998             int distortion;
1999             unsigned int sse;
2000             cpi->find_fractional_mv_step(x, &mode_mv[NEWMV],
2001                                          bsi->ref_mv, x->errorperbit, v_fn_ptr,
2002                                          x->nmvjointcost, x->mvcost,
2003                                          &distortion, &sse);
2004
2005             // safe motion search result for use in compound prediction
2006             seg_mvs[i][mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
2007           }
2008
2009           // restore src pointers
2010           mi_buf_restore(x, orig_src, orig_pre);
2011         }
2012
2013         if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV &&
2014             mbmi->interp_filter == vp9_switchable_interp[0]) {
2015           if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
2016               seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
2017             continue;
2018
2019           // adjust src pointers
2020           mi_buf_shift(x, i);
2021           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
2022             int rate_mv;
2023             joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
2024                                 mi_row, mi_col, seg_mvs[i],
2025                                 &rate_mv);
2026             seg_mvs[i][mbmi->ref_frame[0]].as_int =
2027                 frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
2028             seg_mvs[i][mbmi->ref_frame[1]].as_int =
2029                 frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
2030           }
2031           // restore src pointers
2032           mi_buf_restore(x, orig_src, orig_pre);
2033         }
2034
2035         bsi->rdstat[i][mode_idx].brate =
2036             labels2mode(x, i, this_mode, &mode_mv[this_mode],
2037                         &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
2038                         bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
2039                         x->mvcost, cpi);
2040
2041         bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int;
2042         if (num_4x4_blocks_wide > 1)
2043           bsi->rdstat[i + 1][mode_idx].mvs[0].as_int =
2044               mode_mv[this_mode].as_int;
2045         if (num_4x4_blocks_high > 1)
2046           bsi->rdstat[i + 2][mode_idx].mvs[0].as_int =
2047               mode_mv[this_mode].as_int;
2048         if (mbmi->ref_frame[1] > 0) {
2049           bsi->rdstat[i][mode_idx].mvs[1].as_int =
2050               second_mode_mv[this_mode].as_int;
2051           if (num_4x4_blocks_wide > 1)
2052             bsi->rdstat[i + 1][mode_idx].mvs[1].as_int =
2053                 second_mode_mv[this_mode].as_int;
2054           if (num_4x4_blocks_high > 1)
2055             bsi->rdstat[i + 2][mode_idx].mvs[1].as_int =
2056                 second_mode_mv[this_mode].as_int;
2057         }
2058
2059         // Trap vectors that reach beyond the UMV borders
2060         if (mv_check_bounds(x, &mode_mv[this_mode]))
2061           continue;
2062         if (mbmi->ref_frame[1] > 0 &&
2063             mv_check_bounds(x, &second_mode_mv[this_mode]))
2064           continue;
2065
2066         if (filter_idx > 0) {
2067           BEST_SEG_INFO *ref_bsi = bsi_buf;
2068           subpelmv = (mode_mv[this_mode].as_mv.row & 0x0f) ||
2069                      (mode_mv[this_mode].as_mv.col & 0x0f);
2070           have_ref = mode_mv[this_mode].as_int ==
2071                      ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
2072           if (mbmi->ref_frame[1] > 0) {
2073             subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) ||
2074                         (second_mode_mv[this_mode].as_mv.col & 0x0f);
2075             have_ref  &= second_mode_mv[this_mode].as_int ==
2076                          ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
2077           }
2078
2079           if (filter_idx > 1 && !subpelmv && !have_ref) {
2080             ref_bsi = bsi_buf + 1;
2081             have_ref = mode_mv[this_mode].as_int ==
2082                        ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
2083             if (mbmi->ref_frame[1] > 0) {
2084               have_ref  &= second_mode_mv[this_mode].as_int ==
2085                            ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
2086             }
2087           }
2088
2089           if (!subpelmv && have_ref &&
2090               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
2091             vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
2092                        sizeof(SEG_RDSTAT));
2093             if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
2094               mode_selected = this_mode;
2095               best_rd = bsi->rdstat[i][mode_idx].brdcost;
2096             }
2097             continue;
2098           }
2099         }
2100
2101         bsi->rdstat[i][mode_idx].brdcost =
2102             encode_inter_mb_segment(cpi, x,
2103                                     bsi->segment_rd - this_segment_rd, i,
2104                                     &bsi->rdstat[i][mode_idx].byrate,
2105                                     &bsi->rdstat[i][mode_idx].bdist,
2106                                     &bsi->rdstat[i][mode_idx].bsse,
2107                                     bsi->rdstat[i][mode_idx].ta,
2108                                     bsi->rdstat[i][mode_idx].tl);
2109         if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
2110           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
2111                                             bsi->rdstat[i][mode_idx].brate, 0);
2112           bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
2113           bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i];
2114         }
2115
2116         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
2117           mode_selected = this_mode;
2118           best_rd = bsi->rdstat[i][mode_idx].brdcost;
2119         }
2120       } /*for each 4x4 mode*/
2121
2122       if (best_rd == INT64_MAX) {
2123         int iy, midx;
2124         for (iy = i + 1; iy < 4; ++iy)
2125           for (midx = 0; midx < VP9_INTER_MODES; ++midx)
2126             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
2127         bsi->segment_rd = INT64_MAX;
2128         return;
2129       }
2130
2131       mode_idx = inter_mode_offset(mode_selected);
2132       vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
2133       vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
2134
2135       labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
2136                   &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
2137                   bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
2138                   x->mvcost, cpi);
2139
2140       br += bsi->rdstat[i][mode_idx].brate;
2141       bd += bsi->rdstat[i][mode_idx].bdist;
2142       block_sse += bsi->rdstat[i][mode_idx].bsse;
2143       segmentyrate += bsi->rdstat[i][mode_idx].byrate;
2144       this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
2145
2146       if (this_segment_rd > bsi->segment_rd) {
2147         int iy, midx;
2148         for (iy = i + 1; iy < 4; ++iy)
2149           for (midx = 0; midx < VP9_INTER_MODES; ++midx)
2150             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
2151         bsi->segment_rd = INT64_MAX;
2152         return;
2153       }
2154
2155       for (j = 1; j < num_4x4_blocks_high; ++j)
2156         vpx_memcpy(&x->partition_info->bmi[i + j * 2],
2157                    &x->partition_info->bmi[i],
2158                    sizeof(x->partition_info->bmi[i]));
2159       for (j = 1; j < num_4x4_blocks_wide; ++j)
2160         vpx_memcpy(&x->partition_info->bmi[i + j],
2161                    &x->partition_info->bmi[i],
2162                    sizeof(x->partition_info->bmi[i]));
2163     }
2164   } /* for each label */
2165
2166   bsi->r = br;
2167   bsi->d = bd;
2168   bsi->segment_yrate = segmentyrate;
2169   bsi->segment_rd = this_segment_rd;
2170   bsi->sse = block_sse;
2171
2172   // update the coding decisions
2173   for (i = 0; i < 4; ++i)
2174     bsi->modes[i] = x->partition_info->bmi[i].mode;
2175 }
2176
2177 static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
2178                                            int_mv *best_ref_mv,
2179                                            int_mv *second_best_ref_mv,
2180                                            int64_t best_rd,
2181                                            int *returntotrate,
2182                                            int *returnyrate,
2183                                            int64_t *returndistortion,
2184                                            int *skippable, int64_t *psse,
2185                                            int mvthresh,
2186                                            int_mv seg_mvs[4][MAX_REF_FRAMES],
2187                                            BEST_SEG_INFO *bsi_buf,
2188                                            int filter_idx,
2189                                            int mi_row, int mi_col) {
2190   int i;
2191   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
2192   MACROBLOCKD *xd = &x->e_mbd;
2193   MODE_INFO *mi = xd->mode_info_context;
2194   MB_MODE_INFO *mbmi = &mi->mbmi;
2195   int mode_idx;
2196
2197   vp9_zero(*bsi);
2198
2199   bsi->segment_rd = best_rd;
2200   bsi->ref_mv = best_ref_mv;
2201   bsi->second_ref_mv = second_best_ref_mv;
2202   bsi->mvp.as_int = best_ref_mv->as_int;
2203   bsi->mvthresh = mvthresh;
2204
2205   for (i = 0; i < 4; i++)
2206     bsi->modes[i] = ZEROMV;
2207
2208   rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col);
2209
2210   if (bsi->segment_rd > best_rd)
2211     return INT64_MAX;
2212   /* set it to the best */
2213   for (i = 0; i < 4; i++) {
2214     mode_idx = inter_mode_offset(bsi->modes[i]);
2215     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
2216     if (mbmi->ref_frame[1] > 0)
2217       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
2218     xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
2219     x->partition_info->bmi[i].mode = bsi->modes[i];
2220   }
2221
2222   /*
2223    * used to set mbmi->mv.as_int
2224    */
2225   *returntotrate = bsi->r;
2226   *returndistortion = bsi->d;
2227   *returnyrate = bsi->segment_yrate;
2228   *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
2229   *psse = bsi->sse;
2230   mbmi->mode = bsi->modes[3];
2231
2232   return bsi->segment_rd;
2233 }
2234
2235 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
2236                     uint8_t *ref_y_buffer, int ref_y_stride,
2237                     int ref_frame, BLOCK_SIZE_TYPE block_size ) {
2238   MACROBLOCKD *xd = &x->e_mbd;
2239   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
2240   int_mv this_mv;
2241   int i;
2242   int zero_seen = 0;
2243   int best_index = 0;
2244   int best_sad = INT_MAX;
2245   int this_sad = INT_MAX;
2246   unsigned int max_mv = 0;
2247
2248   uint8_t *src_y_ptr = x->plane[0].src.buf;
2249   uint8_t *ref_y_ptr;
2250   int row_offset, col_offset;
2251
2252   // Get the sad for each candidate reference mv
2253   for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) {
2254     this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int;
2255
2256     max_mv = MAX(max_mv,
2257                  MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
2258     // The list is at an end if we see 0 for a second time.
2259     if (!this_mv.as_int && zero_seen)
2260       break;
2261     zero_seen = zero_seen || !this_mv.as_int;
2262
2263     row_offset = this_mv.as_mv.row >> 3;
2264     col_offset = this_mv.as_mv.col >> 3;
2265     ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
2266
2267     // Find sad for current vector.
2268     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
2269                                            ref_y_ptr, ref_y_stride,
2270                                            0x7fffffff);
2271
2272     // Note if it is the best so far.
2273     if (this_sad < best_sad) {
2274       best_sad = this_sad;
2275       best_index = i;
2276     }
2277   }
2278
2279   // Note the index of the mv that worked best in the reference list.
2280   x->mv_best_ref_index[ref_frame] = best_index;
2281   x->max_mv_context[ref_frame] = max_mv;
2282 }
2283
2284 static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
2285                                      unsigned int *ref_costs_single,
2286                                      unsigned int *ref_costs_comp,
2287                                      vp9_prob *comp_mode_p) {
2288   VP9_COMMON *const cm = &cpi->common;
2289   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
2290   int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id,
2291                                              SEG_LVL_REF_FRAME);
2292   if (seg_ref_active) {
2293     vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
2294     vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
2295     *comp_mode_p = 128;
2296   } else {
2297     vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd);
2298     vp9_prob comp_inter_p = 128;
2299
2300     if (cm->comp_pred_mode == HYBRID_PREDICTION) {
2301       comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd);
2302       *comp_mode_p = comp_inter_p;
2303     } else {
2304       *comp_mode_p = 128;
2305     }
2306
2307     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
2308
2309     if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
2310       vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
2311       vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
2312       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
2313
2314       if (cm->comp_pred_mode == HYBRID_PREDICTION)
2315         base_cost += vp9_cost_bit(comp_inter_p, 0);
2316
2317       ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
2318           ref_costs_single[ALTREF_FRAME] = base_cost;
2319       ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
2320       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
2321       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
2322       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
2323       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
2324     } else {
2325       ref_costs_single[LAST_FRAME]   = 512;
2326       ref_costs_single[GOLDEN_FRAME] = 512;
2327       ref_costs_single[ALTREF_FRAME] = 512;
2328     }
2329     if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
2330       vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
2331       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
2332
2333       if (cm->comp_pred_mode == HYBRID_PREDICTION)
2334         base_cost += vp9_cost_bit(comp_inter_p, 1);
2335
2336       ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
2337       ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
2338     } else {
2339       ref_costs_comp[LAST_FRAME]   = 512;
2340       ref_costs_comp[GOLDEN_FRAME] = 512;
2341     }
2342   }
2343 }
2344
2345 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
2346                          int mode_index,
2347                          PARTITION_INFO *partition,
2348                          int_mv *ref_mv,
2349                          int_mv *second_ref_mv,
2350                          int64_t comp_pred_diff[NB_PREDICTION_TYPES],
2351                          int64_t txfm_size_diff[TX_MODES],
2352                          int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]) {
2353   MACROBLOCKD *const xd = &x->e_mbd;
2354
2355   // Take a snapshot of the coding context so it can be
2356   // restored if we decide to encode this way
2357   ctx->skip = x->skip;
2358   ctx->best_mode_index = mode_index;
2359   ctx->mic = *xd->mode_info_context;
2360
2361   if (partition)
2362     ctx->partition_info = *partition;
2363
2364   ctx->best_ref_mv.as_int = ref_mv->as_int;
2365   ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
2366
2367   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_PREDICTION_ONLY];
2368   ctx->comp_pred_diff   = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
2369   ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
2370
2371   // FIXME(rbultje) does this memcpy the whole array? I believe sizeof()
2372   // doesn't actually work this way
2373   memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
2374   memcpy(ctx->best_filter_diff, best_filter_diff,
2375          sizeof(*best_filter_diff) * (VP9_SWITCHABLE_FILTERS + 1));
2376 }
2377
2378 static void setup_pred_block(const MACROBLOCKD *xd,
2379                              struct buf_2d dst[MAX_MB_PLANE],
2380                              const YV12_BUFFER_CONFIG *src,
2381                              int mi_row, int mi_col,
2382                              const struct scale_factors *scale,
2383                              const struct scale_factors *scale_uv) {
2384   int i;
2385
2386   dst[0].buf = src->y_buffer;
2387   dst[0].stride = src->y_stride;
2388   dst[1].buf = src->u_buffer;
2389   dst[2].buf = src->v_buffer;
2390   dst[1].stride = dst[2].stride = src->uv_stride;
2391 #if CONFIG_ALPHA
2392   dst[3].buf = src->alpha_buffer;
2393   dst[3].stride = src->alpha_stride;
2394 #endif
2395
2396   // TODO(jkoleszar): Make scale factors per-plane data
2397   for (i = 0; i < MAX_MB_PLANE; i++) {
2398     setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
2399                      i ? scale_uv : scale,
2400                      xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
2401   }
2402 }
2403
2404 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
2405                                int idx, MV_REFERENCE_FRAME frame_type,
2406                                BLOCK_SIZE_TYPE block_size,
2407                                int mi_row, int mi_col,
2408                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
2409                                int_mv frame_near_mv[MAX_REF_FRAMES],
2410                                struct buf_2d yv12_mb[4][MAX_MB_PLANE],
2411                                struct scale_factors scale[MAX_REF_FRAMES]) {
2412   VP9_COMMON *cm = &cpi->common;
2413   YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
2414   MACROBLOCKD *const xd = &x->e_mbd;
2415   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
2416
2417   // set up scaling factors
2418   scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
2419
2420   scale[frame_type].x_offset_q4 =
2421       ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp,
2422        VP9_REF_SCALE_SHIFT) & 0xf;
2423   scale[frame_type].y_offset_q4 =
2424       ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp,
2425        VP9_REF_SCALE_SHIFT) & 0xf;
2426
2427   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
2428   // use the UV scaling factors.
2429   setup_pred_block(xd, yv12_mb[frame_type], yv12, mi_row, mi_col,
2430                    &scale[frame_type], &scale[frame_type]);
2431
2432   // Gets an initial list of candidate vectors from neighbours and orders them
2433   vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context,
2434                    xd->prev_mode_info_context,
2435                    frame_type,
2436                    mbmi->ref_mvs[frame_type],
2437                    cpi->common.ref_frame_sign_bias);
2438
2439   // Candidate refinement carried out at encoder and decoder
2440   vp9_find_best_ref_mvs(xd,
2441                         mbmi->ref_mvs[frame_type],
2442                         &frame_nearest_mv[frame_type],
2443                         &frame_near_mv[frame_type]);
2444
2445   // Further refinement that is encode side only to test the top few candidates
2446   // in full and choose the best as the centre point for subsequent searches.
2447   // The current implementation doesn't support scaling.
2448   if (scale[frame_type].x_scale_fp == VP9_REF_NO_SCALE &&
2449       scale[frame_type].y_scale_fp == VP9_REF_NO_SCALE)
2450     mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
2451             frame_type, block_size);
2452 }
2453
2454 static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
2455   YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
2456   int fb = get_ref_frame_idx(cpi, ref_frame);
2457   if (cpi->scaled_ref_idx[fb] != cpi->common.ref_frame_map[fb])
2458     scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb]];
2459   return scaled_ref_frame;
2460 }
2461
2462 static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
2463   MACROBLOCKD *xd = &x->e_mbd;
2464   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
2465
2466   const int c = vp9_get_pred_context_switchable_interp(xd);
2467   const int m = vp9_switchable_interp_map[mbmi->interp_filter];
2468   return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
2469 }
2470
2471 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2472                                  BLOCK_SIZE_TYPE bsize,
2473                                  int mi_row, int mi_col,
2474                                  int_mv *tmp_mv, int *rate_mv) {
2475   MACROBLOCKD *xd = &x->e_mbd;
2476   VP9_COMMON *cm = &cpi->common;
2477   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
2478   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
2479   int bestsme = INT_MAX;
2480   int further_steps, step_param;
2481   int sadpb = x->sadperbit16;
2482   int_mv mvp_full;
2483   int ref = mbmi->ref_frame[0];
2484   int_mv ref_mv = mbmi->ref_mvs[ref][0];
2485   const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
2486
2487   int tmp_col_min = x->mv_col_min;
2488   int tmp_col_max = x->mv_col_max;
2489   int tmp_row_min = x->mv_row_min;
2490   int tmp_row_max = x->mv_row_max;
2491
2492   YV12_BUFFER_CONFIG *scaled_ref_frame = get_scaled_ref_frame(cpi, ref);
2493
2494   if (scaled_ref_frame) {
2495     int i;
2496     // Swap out the reference frame for a version that's been scaled to
2497     // match the resolution of the current frame, allowing the existing
2498     // motion search code to be used without additional modifications.
2499     for (i = 0; i < MAX_MB_PLANE; i++)
2500       backup_yv12[i] = xd->plane[i].pre[0];
2501
2502     setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
2503   }
2504
2505   vp9_clamp_mv_min_max(x, &ref_mv);
2506
2507   // Adjust search parameters based on small partitions' result.
2508   if (x->fast_ms) {
2509     // && abs(mvp_full.as_mv.row - x->pred_mv.as_mv.row) < 24 &&
2510     // abs(mvp_full.as_mv.col - x->pred_mv.as_mv.col) < 24) {
2511     // adjust search range
2512     step_param = 6;
2513     if (x->fast_ms > 1)
2514       step_param = 8;
2515
2516     // Get prediction MV.
2517     mvp_full.as_int = x->pred_mv.as_int;
2518
2519     // Adjust MV sign if needed.
2520     if (cm->ref_frame_sign_bias[ref]) {
2521       mvp_full.as_mv.col *= -1;
2522       mvp_full.as_mv.row *= -1;
2523     }
2524   } else {
2525     // Work out the size of the first step in the mv step search.
2526     // 0 here is maximum length first step. 1 is MAX >> 1 etc.
2527     if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
2528       // Take wtd average of the step_params based on the last frame's
2529       // max mv magnitude and that based on the best ref mvs of the current
2530       // block for the given reference.
2531       step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
2532                     cpi->mv_step_param) >> 1;
2533     } else {
2534       step_param = cpi->mv_step_param;
2535     }
2536     // mvp_full.as_int = ref_mv[0].as_int;
2537     mvp_full.as_int =
2538         mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int;
2539   }
2540
2541   mvp_full.as_mv.col >>= 3;
2542   mvp_full.as_mv.row >>= 3;
2543
2544   // Further step/diamond searches as necessary
2545   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
2546
2547   bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
2548                                    sadpb, further_steps, 1,
2549                                    &cpi->fn_ptr[block_size],
2550                                    &ref_mv, tmp_mv);
2551
2552   x->mv_col_min = tmp_col_min;
2553   x->mv_col_max = tmp_col_max;
2554   x->mv_row_min = tmp_row_min;
2555   x->mv_row_max = tmp_row_max;
2556
2557   if (bestsme < INT_MAX) {
2558     int dis; /* TODO: use dis in distortion calculation later. */
2559     unsigned int sse;
2560     cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv,
2561                                  x->errorperbit,
2562                                  &cpi->fn_ptr[block_size],
2563                                  x->nmvjointcost, x->mvcost,
2564                                  &dis, &sse);
2565   }
2566   *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv,
2567                              x->nmvjointcost, x->mvcost,
2568                              96, xd->allow_high_precision_mv);
2569   if (scaled_ref_frame) {
2570     int i;
2571     for (i = 0; i < MAX_MB_PLANE; i++)
2572       xd->plane[i].pre[0] = backup_yv12[i];
2573   }
2574 }
2575
2576 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2577                                 BLOCK_SIZE_TYPE bsize,
2578                                 int_mv *frame_mv,
2579                                 int mi_row, int mi_col,
2580                                 int_mv single_newmv[MAX_REF_FRAMES],
2581                                 int *rate_mv) {
2582   int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
2583   MACROBLOCKD *xd = &x->e_mbd;
2584   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
2585   int refs[2] = { mbmi->ref_frame[0],
2586     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
2587   int_mv ref_mv[2];
2588   const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
2589   int ite;
2590   // Prediction buffer from second frame.
2591   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
2592
2593   // Do joint motion search in compound mode to get more accurate mv.
2594   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
2595   struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
2596   struct buf_2d scaled_first_yv12;
2597   int last_besterr[2] = {INT_MAX, INT_MAX};
2598   YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
2599   scaled_ref_frame[0] = get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
2600   scaled_ref_frame[1] = get_scaled_ref_frame(cpi, mbmi->ref_frame[1]);
2601
2602   ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
2603   ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
2604
2605   if (scaled_ref_frame[0]) {
2606     int i;
2607     // Swap out the reference frame for a version that's been scaled to
2608     // match the resolution of the current frame, allowing the existing
2609     // motion search code to be used without additional modifications.
2610     for (i = 0; i < MAX_MB_PLANE; i++)
2611       backup_yv12[i] = xd->plane[i].pre[0];
2612     setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL);
2613   }
2614
2615   if (scaled_ref_frame[1]) {
2616     int i;
2617     for (i = 0; i < MAX_MB_PLANE; i++)
2618       backup_second_yv12[i] = xd->plane[i].pre[1];
2619
2620     setup_pre_planes(xd, 0, scaled_ref_frame[1], mi_row, mi_col, NULL);
2621   }
2622
2623   xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
2624                                          mi_row, mi_col);
2625   xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
2626                                          mi_row, mi_col);
2627   scaled_first_yv12 = xd->plane[0].pre[0];
2628
2629   // Initialize mv using single prediction mode result.
2630   frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
2631   frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
2632
2633   // Allow joint search multiple times iteratively for each ref frame
2634   // and break out the search loop if it couldn't find better mv.
2635   for (ite = 0; ite < 4; ite++) {
2636     struct buf_2d ref_yv12[2];
2637     int bestsme = INT_MAX;
2638     int sadpb = x->sadperbit16;
2639     int_mv tmp_mv;
2640     int search_range = 3;
2641
2642     int tmp_col_min = x->mv_col_min;
2643     int tmp_col_max = x->mv_col_max;
2644     int tmp_row_min = x->mv_row_min;
2645     int tmp_row_max = x->mv_row_max;
2646     int id = ite % 2;
2647
2648     // Initialized here because of compiler problem in Visual Studio.
2649     ref_yv12[0] = xd->plane[0].pre[0];
2650     ref_yv12[1] = xd->plane[0].pre[1];
2651
2652     // Get pred block from second frame.
2653     vp9_build_inter_predictor(ref_yv12[!id].buf,
2654                               ref_yv12[!id].stride,
2655                               second_pred, pw,
2656                               &frame_mv[refs[!id]],
2657                               &xd->scale_factor[!id],
2658                               pw, ph, 0,
2659                               &xd->subpix, MV_PRECISION_Q3);
2660
2661     // Compound motion search on first ref frame.
2662     if (id)
2663       xd->plane[0].pre[0] = ref_yv12[id];
2664     vp9_clamp_mv_min_max(x, &ref_mv[id]);
2665
2666     // Use mv result from single mode as mvp.
2667     tmp_mv.as_int = frame_mv[refs[id]].as_int;
2668
2669     tmp_mv.as_mv.col >>= 3;
2670     tmp_mv.as_mv.row >>= 3;
2671
2672     // Small-range full-pixel motion search
2673     bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
2674                                        search_range,
2675                                        &cpi->fn_ptr[block_size],
2676                                        x->nmvjointcost, x->mvcost,
2677                                        &ref_mv[id], second_pred,
2678                                        pw, ph);
2679
2680     x->mv_col_min = tmp_col_min;
2681     x->mv_col_max = tmp_col_max;
2682     x->mv_row_min = tmp_row_min;
2683     x->mv_row_max = tmp_row_max;
2684
2685     if (bestsme < INT_MAX) {
2686       int dis; /* TODO: use dis in distortion calculation later. */
2687       unsigned int sse;
2688
2689       bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
2690                                              &ref_mv[id],
2691                                              x->errorperbit,
2692                                              &cpi->fn_ptr[block_size],
2693                                              x->nmvjointcost, x->mvcost,
2694                                              &dis, &sse, second_pred,
2695                                              pw, ph);
2696     }
2697
2698     if (id)
2699       xd->plane[0].pre[0] = scaled_first_yv12;
2700
2701     if (bestsme < last_besterr[id]) {
2702       frame_mv[refs[id]].as_int = tmp_mv.as_int;
2703       last_besterr[id] = bestsme;
2704     } else {
2705       break;
2706     }
2707   }
2708
2709   // restore the predictor
2710   if (scaled_ref_frame[0]) {
2711     int i;
2712     for (i = 0; i < MAX_MB_PLANE; i++)
2713       xd->plane[i].pre[0] = backup_yv12[i];
2714   }
2715
2716   if (scaled_ref_frame[1]) {
2717     int i;
2718     for (i = 0; i < MAX_MB_PLANE; i++)
2719       xd->plane[i].pre[1] = backup_second_yv12[i];
2720   }
2721   *rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]],
2722                               &mbmi->ref_mvs[refs[0]][0],
2723                               x->nmvjointcost, x->mvcost, 96,
2724                               x->e_mbd.allow_high_precision_mv);
2725   *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]],
2726                               &mbmi->ref_mvs[refs[1]][0],
2727                               x->nmvjointcost, x->mvcost, 96,
2728                               x->e_mbd.allow_high_precision_mv);
2729
2730   vpx_free(second_pred);
2731 }
2732
2733 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
2734                                  BLOCK_SIZE_TYPE bsize,
2735                                  int64_t txfm_cache[],
2736                                  int *rate2, int64_t *distortion,
2737                                  int *skippable,
2738                                  int *rate_y, int64_t *distortion_y,
2739                                  int *rate_uv, int64_t *distortion_uv,
2740                                  int *mode_excluded, int *disable_skip,
2741                                  INTERPOLATIONFILTERTYPE *best_filter,
2742                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
2743                                  int mi_row, int mi_col,
2744                                  int_mv single_newmv[MAX_REF_FRAMES],
2745                                  int64_t *psse, int64_t ref_best_rd) {
2746   VP9_COMMON *cm = &cpi->common;
2747   MACROBLOCKD *xd = &x->e_mbd;
2748   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
2749   const int is_comp_pred = (mbmi->ref_frame[1] > 0);
2750   const int num_refs = is_comp_pred ? 2 : 1;
2751   const int this_mode = mbmi->mode;
2752   int_mv *frame_mv = mode_mv[this_mode];
2753   int i;
2754   int refs[2] = { mbmi->ref_frame[0],
2755     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
2756   int_mv cur_mv[2];
2757   int64_t this_rd = 0;
2758   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
2759   int pred_exists = 0;
2760   int interpolating_intpel_seen = 0;
2761   int intpel_mv;
2762   int64_t rd, best_rd = INT64_MAX;
2763   int best_needs_copy = 0;
2764   uint8_t *orig_dst[MAX_MB_PLANE];
2765   int orig_dst_stride[MAX_MB_PLANE];
2766   int rs = 0;
2767
2768   if (this_mode == NEWMV) {
2769     int rate_mv;
2770     if (is_comp_pred) {
2771       // Initialize mv using single prediction mode result.
2772       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
2773       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
2774
2775       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
2776         joint_motion_search(cpi, x, bsize, frame_mv,
2777                             mi_row, mi_col, single_newmv, &rate_mv);
2778       } else {
2779         rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]],
2780                                    &mbmi->ref_mvs[refs[0]][0],
2781                                    x->nmvjointcost, x->mvcost, 96,
2782                                    x->e_mbd.allow_high_precision_mv);
2783         rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]],
2784                                    &mbmi->ref_mvs[refs[1]][0],
2785                                    x->nmvjointcost, x->mvcost, 96,
2786                                    x->e_mbd.allow_high_precision_mv);
2787       }
2788       if (frame_mv[refs[0]].as_int == INVALID_MV ||
2789           frame_mv[refs[1]].as_int == INVALID_MV)
2790         return INT64_MAX;
2791       *rate2 += rate_mv;
2792     } else {
2793       int_mv tmp_mv;
2794       single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
2795       *rate2 += rate_mv;
2796       frame_mv[refs[0]].as_int =
2797           xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
2798       single_newmv[refs[0]].as_int = tmp_mv.as_int;
2799     }
2800   }
2801
2802   // if we're near/nearest and mv == 0,0, compare to zeromv
2803   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
2804       frame_mv[refs[0]].as_int == 0 &&
2805       !vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
2806       (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) {
2807     int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]];
2808     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
2809     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
2810     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
2811
2812     if (this_mode == NEARMV) {
2813       if (c1 > c3)
2814         return INT64_MAX;
2815     } else if (this_mode == NEARESTMV) {
2816       if (c2 > c3)
2817         return INT64_MAX;
2818     } else {
2819       assert(this_mode == ZEROMV);
2820       if (num_refs == 1) {
2821         if ((c3 >= c2 &&
2822              mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
2823             (c3 >= c1 &&
2824              mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
2825           return INT64_MAX;
2826       } else {
2827         if ((c3 >= c2 &&
2828              mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
2829              mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
2830             (c3 >= c1 &&
2831              mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
2832              mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
2833           return INT64_MAX;
2834       }
2835     }
2836   }
2837
2838   for (i = 0; i < num_refs; ++i) {
2839     cur_mv[i] = frame_mv[refs[i]];
2840     // Clip "next_nearest" so that it does not extend to far out of image
2841     if (this_mode == NEWMV)
2842       assert(!clamp_mv2(&cur_mv[i], xd));
2843     else
2844       clamp_mv2(&cur_mv[i], xd);
2845
2846     if (mv_check_bounds(x, &cur_mv[i]))
2847       return INT64_MAX;
2848     mbmi->mv[i].as_int = cur_mv[i].as_int;
2849   }
2850
2851   // do first prediction into the destination buffer. Do the next
2852   // prediction into a temporary buffer. Then keep track of which one
2853   // of these currently holds the best predictor, and use the other
2854   // one for future predictions. In the end, copy from tmp_buf to
2855   // dst if necessary.
2856   for (i = 0; i < MAX_MB_PLANE; i++) {
2857     orig_dst[i] = xd->plane[i].dst.buf;
2858     orig_dst_stride[i] = xd->plane[i].dst.stride;
2859   }
2860
2861   /* We don't include the cost of the second reference here, because there
2862    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
2863    * words if you present them in that order, the second one is always known
2864    * if the first is known */
2865   *rate2 += cost_mv_ref(cpi, this_mode,
2866                         mbmi->mb_mode_context[mbmi->ref_frame[0]]);
2867
2868   if (!(*mode_excluded)) {
2869     if (is_comp_pred) {
2870       *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
2871     } else {
2872       *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
2873     }
2874   }
2875
2876   pred_exists = 0;
2877   interpolating_intpel_seen = 0;
2878   // Are all MVs integer pel for Y and UV
2879   intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 &&
2880       (mbmi->mv[0].as_mv.col & 15) == 0;
2881   if (is_comp_pred)
2882     intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 &&
2883         (mbmi->mv[1].as_mv.col & 15) == 0;
2884   // Search for best switchable filter by checking the variance of
2885   // pred error irrespective of whether the filter will be used
2886   *best_filter = EIGHTTAP;
2887   if (cpi->sf.use_8tap_always) {
2888     *best_filter = EIGHTTAP;
2889     vp9_zero(cpi->rd_filter_cache);
2890   } else {
2891     int i, newbest;
2892     int tmp_rate_sum = 0;
2893     int64_t tmp_dist_sum = 0;
2894
2895     cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
2896     for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
2897       int j;
2898       int64_t rs_rd;
2899       const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
2900       const int is_intpel_interp = intpel_mv;
2901       mbmi->interp_filter = filter;
2902       vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
2903       rs = get_switchable_rate(cm, x);
2904       rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
2905
2906       if (interpolating_intpel_seen && is_intpel_interp) {
2907         cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
2908                                          tmp_rate_sum, tmp_dist_sum);
2909         cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
2910             MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS],
2911                 cpi->rd_filter_cache[i] + rs_rd);
2912         rd = cpi->rd_filter_cache[i];
2913         if (cm->mcomp_filter_type == SWITCHABLE)
2914           rd += rs_rd;
2915       } else {
2916         int rate_sum = 0;
2917         int64_t dist_sum = 0;
2918         if ((cm->mcomp_filter_type == SWITCHABLE &&
2919              (!i || best_needs_copy)) ||
2920             (cm->mcomp_filter_type != SWITCHABLE &&
2921              (cm->mcomp_filter_type == mbmi->interp_filter ||
2922               (!interpolating_intpel_seen && is_intpel_interp)))) {
2923           for (j = 0; j < MAX_MB_PLANE; j++) {
2924             xd->plane[j].dst.buf = orig_dst[j];
2925             xd->plane[j].dst.stride = orig_dst_stride[j];
2926           }
2927         } else {
2928           for (j = 0; j < MAX_MB_PLANE; j++) {
2929             xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
2930             xd->plane[j].dst.stride = 64;
2931           }
2932         }
2933         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
2934         model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
2935         cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
2936                                          rate_sum, dist_sum);
2937         cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
2938             MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS],
2939                 cpi->rd_filter_cache[i] + rs_rd);
2940         rd = cpi->rd_filter_cache[i];
2941         if (cm->mcomp_filter_type == SWITCHABLE)
2942           rd += rs_rd;
2943         if (!interpolating_intpel_seen && is_intpel_interp) {
2944           tmp_rate_sum = rate_sum;
2945           tmp_dist_sum = dist_sum;
2946         }
2947       }
2948       if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
2949         if (rd / 2 > ref_best_rd) {
2950           for (i = 0; i < MAX_MB_PLANE; i++) {
2951             xd->plane[i].dst.buf = orig_dst[i];
2952             xd->plane[i].dst.stride = orig_dst_stride[i];
2953           }
2954           return INT64_MAX;
2955         }
2956       }
2957       newbest = i == 0 || rd < best_rd;
2958
2959       if (newbest) {
2960         best_rd = rd;
2961         *best_filter = mbmi->interp_filter;
2962         if (cm->mcomp_filter_type == SWITCHABLE && i &&
2963             !(interpolating_intpel_seen && is_intpel_interp))
2964           best_needs_copy = !best_needs_copy;
2965       }
2966
2967       if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
2968           (cm->mcomp_filter_type != SWITCHABLE &&
2969            cm->mcomp_filter_type == mbmi->interp_filter)) {
2970         pred_exists = 1;
2971       }
2972       interpolating_intpel_seen |= is_intpel_interp;
2973     }
2974
2975     for (i = 0; i < MAX_MB_PLANE; i++) {
2976       xd->plane[i].dst.buf = orig_dst[i];
2977       xd->plane[i].dst.stride = orig_dst_stride[i];
2978     }
2979   }
2980   // Set the appropriate filter
2981   mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
2982       cm->mcomp_filter_type : *best_filter;
2983   vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
2984   rs = (cm->mcomp_filter_type == SWITCHABLE ? get_switchable_rate(cm, x) : 0);
2985
2986   if (pred_exists) {
2987     if (best_needs_copy) {
2988       // again temporarily set the buffers to local memory to prevent a memcpy
2989       for (i = 0; i < MAX_MB_PLANE; i++) {
2990         xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
2991         xd->plane[i].dst.stride = 64;
2992       }
2993     }
2994   } else {
2995     // Handles the special case when a filter that is not in the
2996     // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
2997     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
2998   }
2999
3000
3001   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
3002     int tmp_rate;
3003     int64_t tmp_dist;
3004     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
3005     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
3006     // if current pred_error modeled rd is substantially more than the best
3007     // so far, do not bother doing full rd
3008     if (rd / 2 > ref_best_rd) {
3009       for (i = 0; i < MAX_MB_PLANE; i++) {
3010         xd->plane[i].dst.buf = orig_dst[i];
3011         xd->plane[i].dst.stride = orig_dst_stride[i];
3012       }
3013       return INT64_MAX;
3014     }
3015   }
3016
3017   if (cpi->common.mcomp_filter_type == SWITCHABLE)
3018     *rate2 += get_switchable_rate(cm, x);
3019
3020   if (!is_comp_pred) {
3021     if (cpi->active_map_enabled && x->active_ptr[0] == 0)
3022       x->skip = 1;
3023     else if (x->encode_breakout) {
3024       const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]);
3025       const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize,
3026                                                            &xd->plane[1]);
3027       unsigned int var, sse;
3028       // Skipping threshold for ac.
3029       unsigned int thresh_ac;
3030       // The encode_breakout input
3031       unsigned int encode_breakout = x->encode_breakout << 4;
3032
3033       // Calculate threshold according to dequant value.
3034       thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
3035
3036       // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
3037       if (thresh_ac > 36000)
3038         thresh_ac = 36000;
3039
3040       // Use encode_breakout input if it is bigger than internal threshold.
3041       if (thresh_ac < encode_breakout)
3042         thresh_ac = encode_breakout;
3043
3044       var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
3045                                    xd->plane[0].dst.buf,
3046                                    xd->plane[0].dst.stride, &sse);
3047
3048       // Adjust threshold according to partition size.
3049       thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
3050           b_height_log2_lookup[bsize]);
3051
3052       // Y skipping condition checking
3053       if (sse < thresh_ac || sse == 0) {
3054         // Skipping threshold for dc
3055         unsigned int thresh_dc;
3056
3057         thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
3058
3059         // dc skipping checking
3060         if ((sse - var) < thresh_dc || sse == var) {
3061           unsigned int sse_u, sse_v;
3062           unsigned int var_u, var_v;
3063
3064           var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
3065                                           x->plane[1].src.stride,
3066                                           xd->plane[1].dst.buf,
3067                                           xd->plane[1].dst.stride, &sse_u);
3068
3069           // U skipping condition checking
3070           if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
3071               (sse_u - var_u < thresh_dc || sse_u == var_u)) {
3072             var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
3073                                             x->plane[2].src.stride,
3074                                             xd->plane[2].dst.buf,
3075                                             xd->plane[2].dst.stride, &sse_v);
3076
3077             // V skipping condition checking
3078             if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
3079                 (sse_v - var_v < thresh_dc || sse_v == var_v)) {
3080               x->skip = 1;
3081
3082               *rate2 = 500;
3083               *rate_uv = 0;
3084
3085               // Scaling factor for SSE from spatial domain to frequency domain
3086               // is 16. Adjust distortion accordingly.
3087               *distortion_uv = (sse_u + sse_v) << 4;
3088               *distortion = (sse << 4) + *distortion_uv;
3089
3090               *disable_skip = 1;
3091               this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
3092             }
3093           }
3094         }
3095       }
3096     }
3097   }
3098
3099   if (!x->skip) {
3100     int skippable_y, skippable_uv;
3101     int64_t sseuv = INT_MAX;
3102
3103     // Y cost and distortion
3104     super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
3105                     bsize, txfm_cache, ref_best_rd);
3106
3107     if (*rate_y == INT_MAX) {
3108       *rate2 = INT_MAX;
3109       *distortion = INT64_MAX;
3110       for (i = 0; i < MAX_MB_PLANE; i++) {
3111         xd->plane[i].dst.buf = orig_dst[i];
3112         xd->plane[i].dst.stride = orig_dst_stride[i];
3113       }
3114       return INT64_MAX;
3115     }
3116
3117     *rate2 += *rate_y;
3118     *distortion += *distortion_y;
3119
3120     super_block_uvrd(cm, x, rate_uv, distortion_uv,
3121                      &skippable_uv, &sseuv, bsize);
3122
3123     *psse += sseuv;
3124     *rate2 += *rate_uv;
3125     *distortion += *distortion_uv;
3126     *skippable = skippable_y && skippable_uv;
3127   }
3128
3129   for (i = 0; i < MAX_MB_PLANE; i++) {
3130     xd->plane[i].dst.buf = orig_dst[i];
3131     xd->plane[i].dst.stride = orig_dst_stride[i];
3132   }
3133
3134   return this_rd;  // if 0, this will be re-calculated by caller
3135 }
3136
3137 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
3138                                int *returnrate, int64_t *returndist,
3139                                BLOCK_SIZE_TYPE bsize,
3140                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
3141   VP9_COMMON *const cm = &cpi->common;
3142   MACROBLOCKD *const xd = &x->e_mbd;
3143   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
3144   int y_skip = 0, uv_skip;
3145   int64_t dist_y = 0, dist_uv = 0, txfm_cache[TX_MODES];
3146
3147   x->skip_encode = 0;
3148   vpx_memset(&txfm_cache, 0, sizeof(txfm_cache));
3149   ctx->skip = 0;
3150   xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME;
3151   if (bsize >= BLOCK_SIZE_SB8X8) {
3152     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
3153                                &dist_y, &y_skip, bsize, txfm_cache,
3154                                best_rd) >= best_rd) {
3155       *returnrate = INT_MAX;
3156       return;
3157     }
3158     rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
3159                             &dist_uv, &uv_skip, bsize);
3160   } else {
3161     y_skip = 0;
3162     if (rd_pick_intra4x4mby_modes(cpi, x, &rate_y, &rate_y_tokenonly,
3163                                   &dist_y, best_rd) >= best_rd) {
3164       *returnrate = INT_MAX;
3165       return;
3166     }
3167     rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
3168                             &dist_uv, &uv_skip, BLOCK_SIZE_SB8X8);
3169   }
3170
3171   if (y_skip && uv_skip) {
3172     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
3173                   vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
3174     *returndist = dist_y + (dist_uv >> 2);
3175     memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
3176   } else {
3177     int i;
3178     *returnrate = rate_y + rate_uv +
3179         vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0);
3180     *returndist = dist_y + (dist_uv >> 2);
3181     if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
3182       for (i = 0; i < TX_MODES; i++) {
3183         ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->tx_mode];
3184       }
3185     }
3186   }
3187
3188   ctx->mic = *xd->mode_info_context;
3189 }
3190
3191 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
3192                                   int mi_row, int mi_col,
3193                                   int *returnrate,
3194                                   int64_t *returndistortion,
3195                                   BLOCK_SIZE_TYPE bsize,
3196                                   PICK_MODE_CONTEXT *ctx,
3197                                   int64_t best_rd_so_far) {
3198   VP9_COMMON *cm = &cpi->common;
3199   MACROBLOCKD *xd = &x->e_mbd;
3200   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
3201   const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
3202   MB_PREDICTION_MODE this_mode;
3203   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
3204   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
3205   int comp_pred, i;
3206   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
3207   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
3208   int_mv single_newmv[MAX_REF_FRAMES];
3209   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
3210                                     VP9_ALT_FLAG };
3211   int idx_list[4] = {0,
3212                      cpi->lst_fb_idx,
3213                      cpi->gld_fb_idx,
3214                      cpi->alt_fb_idx};
3215   int64_t best_rd = best_rd_so_far;
3216   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
3217   int64_t best_txfm_rd[TX_MODES];
3218   int64_t best_txfm_diff[TX_MODES];
3219   int64_t best_pred_diff[NB_PREDICTION_TYPES];
3220   int64_t best_pred_rd[NB_PREDICTION_TYPES];
3221   int64_t best_filter_rd[VP9_SWITCHABLE_FILTERS + 1];
3222   int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1];
3223   MB_MODE_INFO best_mbmode;
3224   int j;
3225   int mode_index, best_mode_index = 0;
3226   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3227   vp9_prob comp_mode_p;
3228   int64_t best_intra_rd = INT64_MAX;
3229   int64_t best_inter_rd = INT64_MAX;
3230   MB_PREDICTION_MODE best_intra_mode = DC_PRED;
3231   // MB_PREDICTION_MODE best_inter_mode = ZEROMV;
3232   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
3233   INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
3234   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
3235   int64_t dist_uv[TX_SIZES];
3236   int skip_uv[TX_SIZES];
3237   MB_PREDICTION_MODE mode_uv[TX_SIZES];
3238   struct scale_factors scale_factor[4];
3239   unsigned int ref_frame_mask = 0;
3240   unsigned int mode_mask = 0;
3241   int64_t mode_distortions[MB_MODE_COUNT] = {-1};
3242   int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
3243   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
3244                                              cpi->common.y_dc_delta_q);
3245   int_mv seg_mvs[4][MAX_REF_FRAMES];
3246   union b_mode_info best_bmodes[4];
3247   PARTITION_INFO best_partition;
3248   int bwsl = b_width_log2(bsize);
3249   int bws = (1 << bwsl) / 4;  // mode_info step for subsize
3250   int bhsl = b_height_log2(bsize);
3251   int bhs = (1 << bhsl) / 4;  // mode_info step for subsize
3252   int best_skip2 = 0;
3253
3254   x->skip_encode = (cpi->sf.skip_encode_frame &&
3255                     xd->q_index < QIDX_SKIP_THRESH);
3256
3257   for (i = 0; i < 4; i++) {
3258     int j;
3259     for (j = 0; j < MAX_REF_FRAMES; j++)
3260       seg_mvs[i][j].as_int = INVALID_MV;
3261   }
3262   // Everywhere the flag is set the error is much higher than its neighbors.
3263   ctx->frames_with_high_error = 0;
3264   ctx->modes_with_high_error = 0;
3265
3266   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
3267                            &comp_mode_p);
3268   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
3269   vpx_memset(&single_newmv, 0, sizeof(single_newmv));
3270
3271   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
3272     best_pred_rd[i] = INT64_MAX;
3273   for (i = 0; i < TX_MODES; i++)
3274     best_txfm_rd[i] = INT64_MAX;
3275   for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
3276     best_filter_rd[i] = INT64_MAX;
3277   for (i = 0; i < TX_SIZES; i++)
3278     rate_uv_intra[i] = INT_MAX;
3279
3280   *returnrate = INT_MAX;
3281
3282   // Create a mask set to 1 for each reference frame used by a smaller
3283   // resolution.
3284   if (cpi->sf.use_avoid_tested_higherror) {
3285     switch (block_size) {
3286       case BLOCK_64X64:
3287         for (i = 0; i < 4; i++) {
3288           for (j = 0; j < 4; j++) {
3289             ref_frame_mask |= x->mb_context[i][j].frames_with_high_error;
3290             mode_mask |= x->mb_context[i][j].modes_with_high_error;
3291           }
3292         }
3293         for (i = 0; i < 4; i++) {
3294           ref_frame_mask |= x->sb32_context[i].frames_with_high_error;
3295           mode_mask |= x->sb32_context[i].modes_with_high_error;
3296         }
3297         break;
3298       case BLOCK_32X32:
3299         for (i = 0; i < 4; i++) {
3300           ref_frame_mask |=
3301               x->mb_context[xd->sb_index][i].frames_with_high_error;
3302           mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error;
3303         }
3304         break;
3305       default:
3306         // Until we handle all block sizes set it to present;
3307         ref_frame_mask = 0;
3308         mode_mask = 0;
3309         break;
3310     }
3311     ref_frame_mask = ~ref_frame_mask;
3312     mode_mask = ~mode_mask;
3313   }
3314
3315   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
3316     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
3317       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
3318                          mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
3319                          yv12_mb, scale_factor);
3320     }
3321     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
3322     frame_mv[ZEROMV][ref_frame].as_int = 0;
3323   }
3324
3325   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
3326     int mode_excluded = 0;
3327     int64_t this_rd = INT64_MAX;
3328     int disable_skip = 0;
3329     int compmode_cost = 0;
3330     int rate2 = 0, rate_y = 0, rate_uv = 0;
3331     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
3332     int skippable;
3333     int64_t txfm_cache[TX_MODES];
3334     int i;
3335     int this_skip2 = 0;
3336     int64_t total_sse = INT_MAX;
3337     int early_term = 0;
3338
3339     for (i = 0; i < TX_MODES; ++i)
3340       txfm_cache[i] = INT64_MAX;
3341
3342     x->skip = 0;
3343     this_mode = vp9_mode_order[mode_index].mode;
3344     ref_frame = vp9_mode_order[mode_index].ref_frame;
3345     second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
3346
3347     // Skip modes that have been masked off but always consider first mode.
3348     if (mode_index && (bsize > cpi->sf.unused_mode_skip_lvl) &&
3349          (cpi->unused_mode_skip_mask & (1 << mode_index)) )
3350       continue;
3351
3352     // Skip if the current reference frame has been masked off
3353     if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
3354         (cpi->ref_frame_mask & (1 << ref_frame)))
3355       continue;
3356
3357     // Test best rd so far against threshold for trying this mode.
3358     if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
3359                      cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) ||
3360         cpi->rd_threshes[bsize][mode_index] == INT_MAX)
3361       continue;
3362
3363     // Do not allow compound prediction if the segment level reference
3364     // frame feature is in use as in this case there can only be one reference.
3365     if ((second_ref_frame > INTRA_FRAME) &&
3366          vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME))
3367       continue;
3368
3369     // Skip some checking based on small partitions' result.
3370     if (x->fast_ms > 1 && !ref_frame)
3371       continue;
3372     if (x->fast_ms > 2 && ref_frame != x->subblock_ref)
3373       continue;
3374
3375     if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) {
3376       if (!(ref_frame_mask & (1 << ref_frame))) {
3377         continue;
3378       }
3379       if (!(mode_mask & (1 << this_mode))) {
3380         continue;
3381       }
3382       if (second_ref_frame != NONE
3383           && !(ref_frame_mask & (1 << second_ref_frame))) {
3384         continue;
3385       }
3386     }
3387
3388     mbmi->ref_frame[0] = ref_frame;
3389     mbmi->ref_frame[1] = second_ref_frame;
3390
3391     if (!(ref_frame == INTRA_FRAME
3392         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
3393       continue;
3394     }
3395     if (!(second_ref_frame == NONE
3396         || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
3397       continue;
3398     }
3399
3400     comp_pred = second_ref_frame > INTRA_FRAME;
3401     if (comp_pred) {
3402       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
3403         if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
3404           continue;
3405       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
3406         if (ref_frame != best_inter_ref_frame &&
3407             second_ref_frame != best_inter_ref_frame)
3408           continue;
3409     }
3410     // TODO(jingning, jkoleszar): scaling reference frame not supported for
3411     // SPLITMV.
3412     if (ref_frame > 0 &&
3413         (scale_factor[ref_frame].x_scale_fp != VP9_REF_NO_SCALE ||
3414          scale_factor[ref_frame].y_scale_fp != VP9_REF_NO_SCALE) &&
3415         this_mode == SPLITMV)
3416       continue;
3417
3418     if (second_ref_frame > 0 &&
3419         (scale_factor[second_ref_frame].x_scale_fp != VP9_REF_NO_SCALE ||
3420          scale_factor[second_ref_frame].y_scale_fp != VP9_REF_NO_SCALE) &&
3421         this_mode == SPLITMV)
3422       continue;
3423
3424     set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
3425     mbmi->mode = this_mode;
3426     mbmi->uv_mode = DC_PRED;
3427
3428     // Evaluate all sub-pel filters irrespective of whether we can use
3429     // them for this frame.
3430     mbmi->interp_filter = cm->mcomp_filter_type;
3431     vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
3432
3433     if (bsize >= BLOCK_SIZE_SB8X8 &&
3434         (this_mode == I4X4_PRED || this_mode == SPLITMV))
3435       continue;
3436     if (bsize < BLOCK_SIZE_SB8X8 &&
3437         !(this_mode == I4X4_PRED || this_mode == SPLITMV))
3438       continue;
3439
3440     if (comp_pred) {
3441       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
3442         continue;
3443       set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
3444
3445       mode_excluded = mode_excluded
3446                          ? mode_excluded
3447                          : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
3448     } else {
3449       if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
3450         mode_excluded =
3451             mode_excluded ?
3452                 mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
3453       }
3454     }
3455
3456     // Select predictors
3457     for (i = 0; i < MAX_MB_PLANE; i++) {
3458       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
3459       if (comp_pred)
3460         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
3461     }
3462
3463     // If the segment reference frame feature is enabled....
3464     // then do nothing if the current ref frame is not allowed..
3465     if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME) &&
3466         vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) !=
3467             (int)ref_frame) {
3468       continue;
3469     // If the segment skip feature is enabled....
3470     // then do nothing if the current mode is not allowed..
3471     } else if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP) &&
3472                (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
3473       continue;
3474     // Disable this drop out case if the ref frame
3475     // segment level feature is enabled for this segment. This is to
3476     // prevent the possibility that we end up unable to pick any mode.
3477     } else if (!vp9_segfeature_active(&xd->seg, segment_id,
3478                                       SEG_LVL_REF_FRAME)) {
3479       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
3480       // unless ARNR filtering is enabled in which case we want
3481       // an unfiltered alternative. We allow near/nearest as well
3482       // because they may result in zero-zero MVs but be cheaper.
3483       if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
3484         if ((this_mode != ZEROMV &&
3485              !(this_mode == NEARMV &&
3486                frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
3487              !(this_mode == NEARESTMV &&
3488                frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
3489             ref_frame != ALTREF_FRAME) {
3490           continue;
3491         }
3492       }
3493     }
3494     // TODO(JBB): This is to make up for the fact that we don't have sad
3495     // functions that work when the block size reads outside the umv.  We
3496     // should fix this either by making the motion search just work on
3497     // a representative block in the boundary ( first ) and then implement a
3498     // function that does sads when inside the border..
3499     if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
3500         this_mode == NEWMV) {
3501       continue;
3502     }
3503
3504     if (this_mode == I4X4_PRED) {
3505       int rate;
3506
3507       /*
3508       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
3509           (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME))
3510         continue;
3511         */
3512
3513       mbmi->txfm_size = TX_4X4;
3514       if (rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
3515                                     &distortion_y, best_rd) >= best_rd)
3516         continue;
3517       rate2 += rate;
3518       rate2 += intra_cost_penalty;
3519       distortion2 += distortion_y;
3520
3521       if (rate_uv_intra[TX_4X4] == INT_MAX) {
3522         choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
3523                              &rate_uv_tokenonly[TX_4X4],
3524                              &dist_uv[TX_4X4], &skip_uv[TX_4X4],
3525                              &mode_uv[TX_4X4]);
3526       }
3527       rate2 += rate_uv_intra[TX_4X4];
3528       rate_uv = rate_uv_tokenonly[TX_4X4];
3529       distortion2 += dist_uv[TX_4X4];
3530       distortion_uv = dist_uv[TX_4X4];
3531       mbmi->uv_mode = mode_uv[TX_4X4];
3532       txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3533       for (i = 0; i < TX_MODES; ++i)
3534         txfm_cache[i] = txfm_cache[ONLY_4X4];
3535     } else if (ref_frame == INTRA_FRAME) {
3536       TX_SIZE uv_tx;
3537       // Only search the oblique modes if the best so far is
3538       // one of the neighboring directional modes
3539       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
3540           (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
3541         if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
3542           continue;
3543       }
3544       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
3545         if (conditional_skipintra(mbmi->mode, best_intra_mode))
3546             continue;
3547       }
3548       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
3549                       bsize, txfm_cache, best_rd);
3550
3551       if (rate_y == INT_MAX)
3552         continue;
3553
3554       uv_tx = MIN(mbmi->txfm_size, max_uv_txsize_lookup[bsize]);
3555       if (rate_uv_intra[uv_tx] == INT_MAX) {
3556         choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
3557                              &rate_uv_tokenonly[uv_tx],
3558                              &dist_uv[uv_tx], &skip_uv[uv_tx],
3559                              &mode_uv[uv_tx]);
3560       }
3561
3562       rate_uv = rate_uv_tokenonly[uv_tx];
3563       distortion_uv = dist_uv[uv_tx];
3564       skippable = skippable && skip_uv[uv_tx];
3565       mbmi->uv_mode = mode_uv[uv_tx];
3566
3567       rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
3568       if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
3569         rate2 += intra_cost_penalty;
3570       distortion2 = distortion_y + distortion_uv;
3571     } else if (this_mode == SPLITMV) {
3572       const int is_comp_pred = second_ref_frame > 0;
3573       int rate;
3574       int64_t distortion;
3575       int64_t this_rd_thresh;
3576       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
3577       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
3578       int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
3579       int tmp_best_skippable = 0;
3580       int switchable_filter_index;
3581       int_mv *second_ref = is_comp_pred ?
3582           &mbmi->ref_mvs[second_ref_frame][0] : NULL;
3583       union b_mode_info tmp_best_bmodes[16];
3584       MB_MODE_INFO tmp_best_mbmode;
3585       PARTITION_INFO tmp_best_partition;
3586       BEST_SEG_INFO bsi[VP9_SWITCHABLE_FILTERS];
3587       int pred_exists = 0;
3588       int uv_skippable;
3589       if (is_comp_pred) {
3590         if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
3591           if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
3592             continue;
3593         if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
3594           if (ref_frame != best_inter_ref_frame &&
3595               second_ref_frame != best_inter_ref_frame)
3596             continue;
3597       }
3598
3599       this_rd_thresh = (ref_frame == LAST_FRAME) ?
3600           cpi->rd_threshes[bsize][THR_NEWMV] :
3601           cpi->rd_threshes[bsize][THR_NEWA];
3602       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
3603           cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
3604       xd->mode_info_context->mbmi.txfm_size = TX_4X4;
3605
3606       cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
3607       for (switchable_filter_index = 0;
3608            switchable_filter_index < VP9_SWITCHABLE_FILTERS;
3609            ++switchable_filter_index) {
3610         int newbest, rs;
3611         int64_t rs_rd;
3612         mbmi->interp_filter =
3613             vp9_switchable_interp[switchable_filter_index];
3614         vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
3615
3616         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
3617                      &mbmi->ref_mvs[ref_frame][0],
3618                      second_ref,
3619                      best_yrd,
3620                      &rate, &rate_y, &distortion,
3621                      &skippable, &total_sse,
3622                      (int)this_rd_thresh, seg_mvs,
3623                      bsi, switchable_filter_index,
3624                      mi_row, mi_col);
3625
3626         if (tmp_rd == INT64_MAX)
3627           continue;
3628         cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
3629         rs = get_switchable_rate(cm, x);
3630         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
3631         cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
3632             MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], tmp_rd + rs_rd);
3633         if (cm->mcomp_filter_type == SWITCHABLE)
3634           tmp_rd += rs_rd;
3635
3636         newbest = (tmp_rd < tmp_best_rd);
3637         if (newbest) {
3638           tmp_best_filter = mbmi->interp_filter;
3639           tmp_best_rd = tmp_rd;
3640         }
3641         if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
3642             (mbmi->interp_filter == cm->mcomp_filter_type &&
3643              cm->mcomp_filter_type != SWITCHABLE)) {
3644           tmp_best_rdu = tmp_rd;
3645           tmp_best_rate = rate;
3646           tmp_best_ratey = rate_y;
3647           tmp_best_distortion = distortion;
3648           tmp_best_sse = total_sse;
3649           tmp_best_skippable = skippable;
3650           tmp_best_mbmode = *mbmi;
3651           tmp_best_partition = *x->partition_info;
3652           for (i = 0; i < 4; i++)
3653             tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
3654           pred_exists = 1;
3655           if (switchable_filter_index == 0 &&
3656               cpi->sf.use_rd_breakout &&
3657               best_rd < INT64_MAX) {
3658             if (tmp_best_rdu / 2 > best_rd) {
3659               // skip searching the other filters if the first is
3660               // already substantially larger than the best so far
3661               tmp_best_filter = mbmi->interp_filter;
3662               tmp_best_rdu = INT64_MAX;
3663               break;
3664             }
3665           }
3666         }
3667       }  // switchable_filter_index loop
3668
3669       if (tmp_best_rdu == INT64_MAX)
3670         continue;
3671
3672       mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
3673                              tmp_best_filter : cm->mcomp_filter_type);
3674       vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
3675       if (!pred_exists) {
3676         // Handles the special case when a filter that is not in the
3677         // switchable list (bilinear, 6-tap) is indicated at the frame level
3678         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
3679                      &mbmi->ref_mvs[ref_frame][0],
3680                      second_ref,
3681                      best_yrd,
3682                      &rate, &rate_y, &distortion,
3683                      &skippable, &total_sse,
3684                      (int)this_rd_thresh, seg_mvs,
3685                      bsi, 0,
3686                      mi_row, mi_col);
3687         if (tmp_rd == INT64_MAX)
3688           continue;
3689       } else {
3690         if (cpi->common.mcomp_filter_type == SWITCHABLE) {
3691           int rs = get_switchable_rate(cm, x);
3692           tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
3693         }
3694         tmp_rd = tmp_best_rdu;
3695         total_sse = tmp_best_sse;
3696         rate = tmp_best_rate;
3697         rate_y = tmp_best_ratey;
3698         distortion = tmp_best_distortion;
3699         skippable = tmp_best_skippable;
3700         *mbmi = tmp_best_mbmode;
3701         *x->partition_info = tmp_best_partition;
3702         for (i = 0; i < 4; i++)
3703           xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
3704       }
3705
3706       rate2 += rate;
3707       distortion2 += distortion;
3708
3709       if (cpi->common.mcomp_filter_type == SWITCHABLE)
3710         rate2 += get_switchable_rate(cm, x);
3711
3712       if (!mode_excluded) {
3713         if (is_comp_pred)
3714           mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
3715         else
3716           mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
3717       }
3718       compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred);
3719
3720       if (RDCOST(x->rdmult, x->rddiv, rate2, distortion2) <
3721           best_rd) {
3722         // If even the 'Y' rd value of split is higher than best so far
3723         // then dont bother looking at UV
3724         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
3725                                         BLOCK_SIZE_SB8X8);
3726         vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
3727         super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
3728                                   &uv_skippable, &uv_sse,
3729                                   BLOCK_SIZE_SB8X8, TX_4X4);
3730         rate2 += rate_uv;
3731         distortion2 += distortion_uv;
3732         skippable = skippable && uv_skippable;
3733         total_sse += uv_sse;
3734
3735         txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3736         for (i = 0; i < TX_MODES; ++i)
3737           txfm_cache[i] = txfm_cache[ONLY_4X4];
3738       }
3739     } else {
3740       compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
3741       this_rd = handle_inter_mode(cpi, x, bsize,
3742                                   txfm_cache,
3743                                   &rate2, &distortion2, &skippable,
3744                                   &rate_y, &distortion_y,
3745                                   &rate_uv, &distortion_uv,
3746                                   &mode_excluded, &disable_skip,
3747                                   &tmp_best_filter, frame_mv,
3748                                   mi_row, mi_col,
3749                                   single_newmv, &total_sse, best_rd);
3750       if (this_rd == INT64_MAX)
3751         continue;
3752     }
3753
3754     if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
3755       rate2 += compmode_cost;
3756     }
3757
3758     // Estimate the reference frame signaling cost and add it
3759     // to the rolling cost variable.
3760     if (second_ref_frame > INTRA_FRAME) {
3761       rate2 += ref_costs_comp[ref_frame];
3762     } else {
3763       rate2 += ref_costs_single[ref_frame];
3764     }
3765
3766     if (!disable_skip) {
3767       // Test for the condition where skip block will be activated
3768       // because there are no non zero coefficients and make any
3769       // necessary adjustment for rate. Ignore if skip is coded at
3770       // segment level as the cost wont have been added in.
3771       // Is Mb level skip allowed (i.e. not coded at segment level).
3772       const int mb_skip_allowed = !vp9_segfeature_active(&xd->seg, segment_id,
3773                                                          SEG_LVL_SKIP);
3774
3775       if (skippable && bsize >= BLOCK_SIZE_SB8X8) {
3776         // Back out the coefficient coding costs
3777         rate2 -= (rate_y + rate_uv);
3778         // for best yrd calculation
3779         rate_uv = 0;
3780
3781         if (mb_skip_allowed) {
3782           int prob_skip_cost;
3783
3784           // Cost the skip mb case
3785           vp9_prob skip_prob =
3786             vp9_get_pred_prob_mbskip(cm, xd);
3787
3788           if (skip_prob) {
3789             prob_skip_cost = vp9_cost_bit(skip_prob, 1);
3790             rate2 += prob_skip_cost;
3791           }
3792         }
3793       } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
3794         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
3795             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
3796           // Add in the cost of the no skip flag.
3797           int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
3798                                             0);
3799           rate2 += prob_skip_cost;
3800         } else {
3801           // FIXME(rbultje) make this work for splitmv also
3802           int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
3803                                             1);
3804           rate2 += prob_skip_cost;
3805           distortion2 = total_sse;
3806           assert(total_sse >= 0);
3807           rate2 -= (rate_y + rate_uv);
3808           rate_y = 0;
3809           rate_uv = 0;
3810           this_skip2 = 1;
3811         }
3812       } else if (mb_skip_allowed) {
3813         // Add in the cost of the no skip flag.
3814         int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
3815                                           0);
3816         rate2 += prob_skip_cost;
3817       }
3818
3819       // Calculate the final RD estimate for this mode.
3820       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3821     }
3822
3823     // Keep record of best intra rd
3824     if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME &&
3825         is_intra_mode(xd->mode_info_context->mbmi.mode) &&
3826         this_rd < best_intra_rd) {
3827       best_intra_rd = this_rd;
3828       best_intra_mode = xd->mode_info_context->mbmi.mode;
3829     }
3830     // Keep record of best inter rd with single reference
3831     if (xd->mode_info_context->mbmi.ref_frame[0] > INTRA_FRAME &&
3832         xd->mode_info_context->mbmi.ref_frame[1] == NONE &&
3833         !mode_excluded &&
3834         this_rd < best_inter_rd) {
3835       best_inter_rd = this_rd;
3836       best_inter_ref_frame = ref_frame;
3837       // best_inter_mode = xd->mode_info_context->mbmi.mode;
3838     }
3839
3840     if (!disable_skip && ref_frame == INTRA_FRAME) {
3841       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
3842         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
3843       for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
3844         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
3845     }
3846
3847     if (this_mode != I4X4_PRED && this_mode != SPLITMV) {
3848       // Store the respective mode distortions for later use.
3849       if (mode_distortions[this_mode] == -1
3850           || distortion2 < mode_distortions[this_mode]) {
3851         mode_distortions[this_mode] = distortion2;
3852       }
3853       if (frame_distortions[ref_frame] == -1
3854           || distortion2 < frame_distortions[ref_frame]) {
3855         frame_distortions[ref_frame] = distortion2;
3856       }
3857     }
3858
3859     // Did this mode help.. i.e. is it the new best mode
3860     if (this_rd < best_rd || x->skip) {
3861       if (!mode_excluded) {
3862         // Note index of best mode so far
3863         const int qstep = xd->plane[0].dequant[1];
3864
3865         best_mode_index = mode_index;
3866
3867         if (ref_frame == INTRA_FRAME) {
3868           /* required for left and above block mv */
3869           mbmi->mv[0].as_int = 0;
3870         }
3871
3872         *returnrate = rate2;
3873         *returndistortion = distortion2;
3874         best_rd = this_rd;
3875         best_yrd = best_rd -
3876                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
3877         best_mbmode = *mbmi;
3878         best_skip2 = this_skip2;
3879         best_partition = *x->partition_info;
3880
3881         if (this_mode == I4X4_PRED || this_mode == SPLITMV)
3882           for (i = 0; i < 4; i++)
3883             best_bmodes[i] = xd->mode_info_context->bmi[i];
3884
3885         // TODO(debargha): enhance this test with a better distortion prediction
3886         // based on qp, activity mask and history
3887         if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE)
3888           if (ref_frame > INTRA_FRAME && distortion2 * 4 < qstep * qstep)
3889             early_term = 1;
3890       }
3891 #if 0
3892       // Testing this mode gave rise to an improvement in best error score.
3893       // Lower threshold a bit for next time
3894       cpi->rd_thresh_mult[mode_index] =
3895           (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
3896               cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
3897       cpi->rd_threshes[mode_index] =
3898           (cpi->rd_baseline_thresh[mode_index] >> 7)
3899               * cpi->rd_thresh_mult[mode_index];
3900 #endif
3901     } else {
3902       // If the mode did not help improve the best error case then
3903       // raise the threshold for testing that mode next time around.
3904 #if 0
3905       cpi->rd_thresh_mult[mode_index] += 4;
3906
3907       if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
3908         cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
3909
3910       cpi->rd_threshes[mode_index] =
3911           (cpi->rd_baseline_thresh[mode_index] >> 7)
3912               * cpi->rd_thresh_mult[mode_index];
3913 #endif
3914     }
3915
3916     /* keep record of best compound/single-only prediction */
3917     if (!disable_skip && ref_frame != INTRA_FRAME) {
3918       int single_rd, hybrid_rd, single_rate, hybrid_rate;
3919
3920       if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
3921         single_rate = rate2 - compmode_cost;
3922         hybrid_rate = rate2;
3923       } else {
3924         single_rate = rate2;
3925         hybrid_rate = rate2 + compmode_cost;
3926       }
3927
3928       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
3929       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
3930
3931       if (second_ref_frame <= INTRA_FRAME &&
3932           single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
3933         best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
3934       } else if (second_ref_frame > INTRA_FRAME &&
3935                  single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
3936         best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
3937       }
3938       if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
3939         best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
3940     }
3941
3942     /* keep record of best filter type */
3943     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
3944         cm->mcomp_filter_type != BILINEAR) {
3945       int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
3946                               VP9_SWITCHABLE_FILTERS :
3947                               vp9_switchable_interp_map[cm->mcomp_filter_type]];
3948       for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
3949         int64_t adj_rd;
3950         // In cases of poor prediction, filter_cache[] can contain really big
3951         // values, which actually are bigger than this_rd itself. This can
3952         // cause negative best_filter_rd[] values, which is obviously silly.
3953         // Therefore, if filter_cache < ref, we do an adjusted calculation.
3954         if (cpi->rd_filter_cache[i] >= ref)
3955           adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
3956         else  // FIXME(rbultje) do this for comppred also
3957           adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref;
3958         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
3959       }
3960     }
3961
3962     /* keep record of best txfm size */
3963     if (bsize < BLOCK_SIZE_SB32X32) {
3964       if (bsize < BLOCK_SIZE_MB16X16) {
3965         if (this_mode == SPLITMV || this_mode == I4X4_PRED)
3966           txfm_cache[ALLOW_8X8] = txfm_cache[ONLY_4X4];
3967         txfm_cache[ALLOW_16X16] = txfm_cache[ALLOW_8X8];
3968       }
3969       txfm_cache[ALLOW_32X32] = txfm_cache[ALLOW_16X16];
3970     }
3971     if (!mode_excluded && this_rd != INT64_MAX) {
3972       for (i = 0; i < TX_MODES; i++) {
3973         int64_t adj_rd = INT64_MAX;
3974         if (this_mode != I4X4_PRED) {
3975           adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->tx_mode];
3976         } else {
3977           adj_rd = this_rd;
3978         }
3979
3980         if (adj_rd < best_txfm_rd[i])
3981           best_txfm_rd[i] = adj_rd;
3982       }
3983     }
3984
3985     if (early_term)
3986       break;
3987
3988     if (x->skip && !comp_pred)
3989       break;
3990   }
3991
3992   if (best_rd >= best_rd_so_far)
3993     return INT64_MAX;
3994
3995   // If we used an estimate for the uv intra rd in the loop above...
3996   if (cpi->sf.use_uv_intra_rd_estimate) {
3997     // Do Intra UV best rd mode selection if best mode choice above was intra.
3998     if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
3999       TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
4000       rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
4001                               &rate_uv_tokenonly[uv_tx_size],
4002                               &dist_uv[uv_tx_size],
4003                               &skip_uv[uv_tx_size],
4004                               (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8
4005                                                          : bsize);
4006     }
4007   }
4008
4009   // If indicated then mark the index of the chosen mode to be inspected at
4010   // other block sizes.
4011   if (bsize <= cpi->sf.unused_mode_skip_lvl) {
4012     cpi->unused_mode_skip_mask = cpi->unused_mode_skip_mask &
4013                                  (~((int64_t)1 << best_mode_index));
4014   }
4015
4016   // If we are using reference masking and the set mask flag is set then
4017   // create the reference frame mask.
4018   if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
4019     cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame);
4020
4021   // Flag all modes that have a distortion thats > 2x the best we found at
4022   // this level.
4023   for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
4024     if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
4025       continue;
4026
4027     if (mode_distortions[mode_index] > 2 * *returndistortion) {
4028       ctx->modes_with_high_error |= (1 << mode_index);
4029     }
4030   }
4031
4032   // Flag all ref frames that have a distortion thats > 2x the best we found at
4033   // this level.
4034   for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
4035     if (frame_distortions[ref_frame] > 2 * *returndistortion) {
4036       ctx->frames_with_high_error |= (1 << ref_frame);
4037     }
4038   }
4039
4040   if (best_rd == INT64_MAX && bsize < BLOCK_SIZE_SB8X8) {
4041     *returnrate = INT_MAX;
4042     *returndistortion = INT_MAX;
4043     return best_rd;
4044   }
4045
4046   assert((cm->mcomp_filter_type == SWITCHABLE) ||
4047          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
4048          (best_mbmode.ref_frame[0] == INTRA_FRAME));
4049
4050   // Updating rd_thresh_freq_fact[] here means that the different
4051   // partition/block sizes are handled independently based on the best
4052   // choice for the current partition. It may well be better to keep a scaled
4053   // best rd so far value and update rd_thresh_freq_fact based on the mode/size
4054   // combination that wins out.
4055   if (cpi->sf.adaptive_rd_thresh) {
4056     for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
4057       if (mode_index == best_mode_index) {
4058         cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT;
4059       } else {
4060         cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC;
4061         if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
4062             (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) {
4063           cpi->rd_thresh_freq_fact[bsize][mode_index] =
4064             cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT;
4065         }
4066       }
4067     }
4068   }
4069
4070   // TODO(rbultje) integrate with RD thresholding
4071 #if 0
4072   // Reduce the activation RD thresholds for the best choice mode
4073   if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
4074       (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
4075     int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
4076
4077     cpi->rd_thresh_mult[best_mode_index] =
4078       (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
4079       cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
4080     cpi->rd_threshes[best_mode_index] =
4081       (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
4082   }
4083 #endif
4084
4085   // macroblock modes
4086   *mbmi = best_mbmode;
4087   x->skip |= best_skip2;
4088   if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
4089       best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
4090     for (i = 0; i < 4; i++)
4091       xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
4092   }
4093
4094   if (best_mbmode.ref_frame[0] != INTRA_FRAME &&
4095       best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
4096     for (i = 0; i < 4; i++)
4097       xd->mode_info_context->bmi[i].as_mv[0].as_int =
4098           best_bmodes[i].as_mv[0].as_int;
4099
4100     if (mbmi->ref_frame[1] > 0)
4101       for (i = 0; i < 4; i++)
4102         xd->mode_info_context->bmi[i].as_mv[1].as_int =
4103             best_bmodes[i].as_mv[1].as_int;
4104
4105     *x->partition_info = best_partition;
4106
4107     mbmi->mv[0].as_int = xd->mode_info_context->bmi[3].as_mv[0].as_int;
4108     mbmi->mv[1].as_int = xd->mode_info_context->bmi[3].as_mv[1].as_int;
4109   }
4110
4111   for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
4112     if (best_pred_rd[i] == INT64_MAX)
4113       best_pred_diff[i] = INT_MIN;
4114     else
4115       best_pred_diff[i] = best_rd - best_pred_rd[i];
4116   }
4117
4118   if (!x->skip) {
4119     for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
4120       if (best_filter_rd[i] == INT64_MAX)
4121         best_filter_diff[i] = 0;
4122       else
4123         best_filter_diff[i] = best_rd - best_filter_rd[i];
4124     }
4125     if (cm->mcomp_filter_type == SWITCHABLE)
4126       assert(best_filter_diff[VP9_SWITCHABLE_FILTERS] == 0);
4127   } else {
4128     vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff));
4129   }
4130
4131   if (!x->skip) {
4132     for (i = 0; i < TX_MODES; i++) {
4133       if (best_txfm_rd[i] == INT64_MAX)
4134         best_txfm_diff[i] = 0;
4135       else
4136         best_txfm_diff[i] = best_rd - best_txfm_rd[i];
4137     }
4138   } else {
4139     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
4140   }
4141
4142   set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
4143                     scale_factor);
4144   store_coding_context(x, ctx, best_mode_index,
4145                        &best_partition,
4146                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
4147                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
4148                                       mbmi->ref_frame[1]][0],
4149                        best_pred_diff, best_txfm_diff, best_filter_diff);
4150
4151   return best_rd;
4152 }