granicus.if.org Git - libvpx/blob - vp10/encoder/rd.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <assert.h>
  12 #include <math.h>
  13 #include <stdio.h>
  14
  15 #include "./vp10_rtcd.h"
  16
  17 #include "vpx_dsp/vpx_dsp_common.h"
  18 #include "vpx_mem/vpx_mem.h"
  19 #include "vpx_ports/bitops.h"
  20 #include "vpx_ports/mem.h"
  21 #include "vpx_ports/system_state.h"
  22
  23 #include "vp10/common/common.h"
  24 #include "vp10/common/entropy.h"
  25 #include "vp10/common/entropymode.h"
  26 #include "vp10/common/mvref_common.h"
  27 #include "vp10/common/pred_common.h"
  28 #include "vp10/common/quant_common.h"
  29 #include "vp10/common/reconinter.h"
  30 #include "vp10/common/reconintra.h"
  31 #include "vp10/common/seg_common.h"
  32
  33 #include "vp10/encoder/cost.h"
  34 #include "vp10/encoder/encodemb.h"
  35 #include "vp10/encoder/encodemv.h"
  36 #include "vp10/encoder/encoder.h"
  37 #include "vp10/encoder/mcomp.h"
  38 #include "vp10/encoder/quantize.h"
  39 #include "vp10/encoder/ratectrl.h"
  40 #include "vp10/encoder/rd.h"
  41 #include "vp10/encoder/tokenize.h"
  42
  43 #define RD_THRESH_POW      1.25
  44 #define RD_MULT_EPB_RATIO  64
  45
  46 // Factor to weigh the rate for switchable interp filters.
  47 #define SWITCHABLE_INTERP_RATE_FACTOR 1
  48
  49 void vp10_rd_cost_reset(RD_COST *rd_cost) {
  50   rd_cost->rate = INT_MAX;
  51   rd_cost->dist = INT64_MAX;
  52   rd_cost->rdcost = INT64_MAX;
  53 }
  54
  55 void vp10_rd_cost_init(RD_COST *rd_cost) {
  56   rd_cost->rate = 0;
  57   rd_cost->dist = 0;
  58   rd_cost->rdcost = 0;
  59 }
  60
  61 // The baseline rd thresholds for breaking out of the rd loop for
  62 // certain modes are assumed to be based on 8x8 blocks.
  63 // This table is used to correct for block size.
  64 // The factors here are << 2 (2 = x0.5, 32 = x8 etc).
  65 static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
  66   2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
  67 };
  68
  69 static void fill_mode_costs(VP10_COMP *cpi) {
  70   const FRAME_CONTEXT *const fc = cpi->common.fc;
  71   int i, j;
  72
  73   for (i = 0; i < INTRA_MODES; ++i)
  74     for (j = 0; j < INTRA_MODES; ++j)
  75       vp10_cost_tokens(cpi->y_mode_costs[i][j], vp10_kf_y_mode_prob[i][j],
  76                       vp10_intra_mode_tree);
  77
  78   vp10_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp10_intra_mode_tree);
  79   vp10_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME],
  80                   vp10_kf_uv_mode_prob[TM_PRED], vp10_intra_mode_tree);
  81   vp10_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME],
  82                   fc->uv_mode_prob[TM_PRED], vp10_intra_mode_tree);
  83
  84   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
  85     vp10_cost_tokens(cpi->switchable_interp_costs[i],
  86                     fc->switchable_interp_prob[i], vp10_switchable_interp_tree);
  87
  88   for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) {
  89     vp10_cost_tokens(cpi->palette_y_size_cost[i],
  90                      vp10_default_palette_y_size_prob[i],
  91                      vp10_palette_size_tree);
  92     vp10_cost_tokens(cpi->palette_uv_size_cost[i],
  93                      vp10_default_palette_uv_size_prob[i],
  94                      vp10_palette_size_tree);
  95   }
  96
  97   for (i = 0; i < PALETTE_MAX_SIZE - 1; ++i)
  98     for (j = 0; j < PALETTE_COLOR_CONTEXTS; ++j) {
  99       vp10_cost_tokens(cpi->palette_y_color_cost[i][j],
 100                        vp10_default_palette_y_color_prob[i][j],
 101                        vp10_palette_color_tree[i]);
 102       vp10_cost_tokens(cpi->palette_uv_color_cost[i][j],
 103                        vp10_default_palette_uv_color_prob[i][j],
 104                        vp10_palette_color_tree[i]);
 105     }
 106 }
 107
 108 static void fill_token_costs(vp10_coeff_cost *c,
 109                              vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
 110   int i, j, k, l;
 111   TX_SIZE t;
 112   for (t = TX_4X4; t <= TX_32X32; ++t)
 113     for (i = 0; i < PLANE_TYPES; ++i)
 114       for (j = 0; j < REF_TYPES; ++j)
 115         for (k = 0; k < COEF_BANDS; ++k)
 116           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
 117             vpx_prob probs[ENTROPY_NODES];
 118             vp10_model_to_full_probs(p[t][i][j][k][l], probs);
 119             vp10_cost_tokens((int *)c[t][i][j][k][0][l], probs,
 120                             vp10_coef_tree);
 121             vp10_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
 122                                  vp10_coef_tree);
 123             assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
 124                    c[t][i][j][k][1][l][EOB_TOKEN]);
 125           }
 126 }
 127
 128 // Values are now correlated to quantizer.
 129 static int sad_per_bit16lut_8[QINDEX_RANGE];
 130 static int sad_per_bit4lut_8[QINDEX_RANGE];
 131
 132 #if CONFIG_VP9_HIGHBITDEPTH
 133 static int sad_per_bit16lut_10[QINDEX_RANGE];
 134 static int sad_per_bit4lut_10[QINDEX_RANGE];
 135 static int sad_per_bit16lut_12[QINDEX_RANGE];
 136 static int sad_per_bit4lut_12[QINDEX_RANGE];
 137 #endif
 138
 139 static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
 140                             vpx_bit_depth_t bit_depth) {
 141   int i;
 142   // Initialize the sad lut tables using a formulaic calculation for now.
 143   // This is to make it easier to resolve the impact of experimental changes
 144   // to the quantizer tables.
 145   for (i = 0; i < range; i++) {
 146     const double q = vp10_convert_qindex_to_q(i, bit_depth);
 147     bit16lut[i] = (int)(0.0418 * q + 2.4107);
 148     bit4lut[i] = (int)(0.063 * q + 2.742);
 149   }
 150 }
 151
 152 void vp10_init_me_luts(void) {
 153   init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
 154                   VPX_BITS_8);
 155 #if CONFIG_VP9_HIGHBITDEPTH
 156   init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
 157                   VPX_BITS_10);
 158   init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
 159                   VPX_BITS_12);
 160 #endif
 161 }
 162
 163 static const int rd_boost_factor[16] = {
 164   64, 32, 32, 32, 24, 16, 12, 12,
 165   8, 8, 4, 4, 2, 2, 1, 0
 166 };
 167 static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
 168   128, 144, 128, 128, 144
 169 };
 170
 171 int vp10_compute_rd_mult(const VP10_COMP *cpi, int qindex) {
 172   const int64_t q = vp10_dc_quant(qindex, 0, cpi->common.bit_depth);
 173 #if CONFIG_VP9_HIGHBITDEPTH
 174   int64_t rdmult = 0;
 175   switch (cpi->common.bit_depth) {
 176     case VPX_BITS_8:
 177       rdmult = 88 * q * q / 24;
 178       break;
 179     case VPX_BITS_10:
 180       rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4);
 181       break;
 182     case VPX_BITS_12:
 183       rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8);
 184       break;
 185     default:
 186       assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
 187       return -1;
 188   }
 189 #else
 190   int64_t rdmult = 88 * q * q / 24;
 191 #endif  // CONFIG_VP9_HIGHBITDEPTH
 192   if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
 193     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
 194     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
 195     const int boost_index = VPXMIN(15, (cpi->rc.gfu_boost / 100));
 196
 197     rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
 198     rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
 199   }
 200   if (rdmult < 1)
 201     rdmult = 1;
 202   return (int)rdmult;
 203 }
 204
 205 static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
 206   double q;
 207 #if CONFIG_VP9_HIGHBITDEPTH
 208   switch (bit_depth) {
 209     case VPX_BITS_8:
 210       q = vp10_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
 211       break;
 212     case VPX_BITS_10:
 213       q = vp10_dc_quant(qindex, 0, VPX_BITS_10) / 16.0;
 214       break;
 215     case VPX_BITS_12:
 216       q = vp10_dc_quant(qindex, 0, VPX_BITS_12) / 64.0;
 217       break;
 218     default:
 219       assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
 220       return -1;
 221   }
 222 #else
 223   (void) bit_depth;
 224   q = vp10_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
 225 #endif  // CONFIG_VP9_HIGHBITDEPTH
 226   // TODO(debargha): Adjust the function below.
 227   return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 228 }
 229
 230 void vp10_initialize_me_consts(VP10_COMP *cpi, MACROBLOCK *x, int qindex) {
 231 #if CONFIG_VP9_HIGHBITDEPTH
 232   switch (cpi->common.bit_depth) {
 233     case VPX_BITS_8:
 234       x->sadperbit16 = sad_per_bit16lut_8[qindex];
 235       x->sadperbit4 = sad_per_bit4lut_8[qindex];
 236       break;
 237     case VPX_BITS_10:
 238       x->sadperbit16 = sad_per_bit16lut_10[qindex];
 239       x->sadperbit4 = sad_per_bit4lut_10[qindex];
 240       break;
 241     case VPX_BITS_12:
 242       x->sadperbit16 = sad_per_bit16lut_12[qindex];
 243       x->sadperbit4 = sad_per_bit4lut_12[qindex];
 244       break;
 245     default:
 246       assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
 247   }
 248 #else
 249   (void)cpi;
 250   x->sadperbit16 = sad_per_bit16lut_8[qindex];
 251   x->sadperbit4 = sad_per_bit4lut_8[qindex];
 252 #endif  // CONFIG_VP9_HIGHBITDEPTH
 253 }
 254
 255 static void set_block_thresholds(const VP10_COMMON *cm, RD_OPT *rd) {
 256   int i, bsize, segment_id;
 257
 258   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
 259     const int qindex =
 260         clamp(vp10_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
 261               cm->y_dc_delta_q, 0, MAXQ);
 262     const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
 263
 264     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
 265       // Threshold here seems unnecessarily harsh but fine given actual
 266       // range of values used for cpi->sf.thresh_mult[].
 267       const int t = q * rd_thresh_block_size_factor[bsize];
 268       const int thresh_max = INT_MAX / t;
 269
 270       if (bsize >= BLOCK_8X8) {
 271         for (i = 0; i < MAX_MODES; ++i)
 272           rd->threshes[segment_id][bsize][i] =
 273               rd->thresh_mult[i] < thresh_max
 274                   ? rd->thresh_mult[i] * t / 4
 275                   : INT_MAX;
 276       } else {
 277         for (i = 0; i < MAX_REFS; ++i)
 278           rd->threshes[segment_id][bsize][i] =
 279               rd->thresh_mult_sub8x8[i] < thresh_max
 280                   ? rd->thresh_mult_sub8x8[i] * t / 4
 281                   : INT_MAX;
 282       }
 283     }
 284   }
 285 }
 286
 287 void vp10_initialize_rd_consts(VP10_COMP *cpi) {
 288   VP10_COMMON *const cm = &cpi->common;
 289   MACROBLOCK *const x = &cpi->td.mb;
 290   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 291   RD_OPT *const rd = &cpi->rd;
 292   int i;
 293
 294   vpx_clear_system_state();
 295
 296   rd->RDDIV = RDDIV_BITS;  // In bits (to multiply D by 128).
 297   rd->RDMULT = vp10_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
 298
 299   x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO;
 300   x->errorperbit += (x->errorperbit == 0);
 301
 302   x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
 303                        cm->frame_type != KEY_FRAME) ? 0 : 1;
 304
 305   set_block_thresholds(cm, rd);
 306   set_partition_probs(cm, xd);
 307
 308   fill_token_costs(x->token_costs, cm->fc->coef_probs);
 309
 310   if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
 311       cm->frame_type == KEY_FRAME) {
 312     for (i = 0; i < PARTITION_CONTEXTS; ++i)
 313       vp10_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
 314                       vp10_partition_tree);
 315   }
 316
 317   fill_mode_costs(cpi);
 318
 319   if (!frame_is_intra_only(cm)) {
 320     vp10_build_nmv_cost_table(x->nmvjointcost,
 321                              cm->allow_high_precision_mv ? x->nmvcost_hp
 322                                                          : x->nmvcost,
 323                              &cm->fc->nmvc, cm->allow_high_precision_mv);
 324
 325     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
 326       vp10_cost_tokens((int *)cpi->inter_mode_cost[i],
 327                       cm->fc->inter_mode_probs[i], vp10_inter_mode_tree);
 328   }
 329 }
 330
 331 static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
 332   // NOTE: The tables below must be of the same size.
 333
 334   // The functions described below are sampled at the four most significant
 335   // bits of x^2 + 8 / 256.
 336
 337   // Normalized rate:
 338   // This table models the rate for a Laplacian source with given variance
 339   // when quantized with a uniform quantizer with given stepsize. The
 340   // closed form expression is:
 341   // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
 342   // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
 343   // and H(x) is the binary entropy function.
 344   static const int rate_tab_q10[] = {
 345     65536,  6086,  5574,  5275,  5063,  4899,  4764,  4651,
 346      4553,  4389,  4255,  4142,  4044,  3958,  3881,  3811,
 347      3748,  3635,  3538,  3453,  3376,  3307,  3244,  3186,
 348      3133,  3037,  2952,  2877,  2809,  2747,  2690,  2638,
 349      2589,  2501,  2423,  2353,  2290,  2232,  2179,  2130,
 350      2084,  2001,  1928,  1862,  1802,  1748,  1698,  1651,
 351      1608,  1530,  1460,  1398,  1342,  1290,  1243,  1199,
 352      1159,  1086,  1021,   963,   911,   864,   821,   781,
 353       745,   680,   623,   574,   530,   490,   455,   424,
 354       395,   345,   304,   269,   239,   213,   190,   171,
 355       154,   126,   104,    87,    73,    61,    52,    44,
 356        38,    28,    21,    16,    12,    10,     8,     6,
 357         5,     3,     2,     1,     1,     1,     0,     0,
 358   };
 359   // Normalized distortion:
 360   // This table models the normalized distortion for a Laplacian source
 361   // with given variance when quantized with a uniform quantizer
 362   // with given stepsize. The closed form expression is:
 363   // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
 364   // where x = qpstep / sqrt(variance).
 365   // Note the actual distortion is Dn * variance.
 366   static const int dist_tab_q10[] = {
 367        0,     0,     1,     1,     1,     2,     2,     2,
 368        3,     3,     4,     5,     5,     6,     7,     7,
 369        8,     9,    11,    12,    13,    15,    16,    17,
 370       18,    21,    24,    26,    29,    31,    34,    36,
 371       39,    44,    49,    54,    59,    64,    69,    73,
 372       78,    88,    97,   106,   115,   124,   133,   142,
 373      151,   167,   184,   200,   215,   231,   245,   260,
 374      274,   301,   327,   351,   375,   397,   418,   439,
 375      458,   495,   528,   559,   587,   613,   637,   659,
 376      680,   717,   749,   777,   801,   823,   842,   859,
 377      874,   899,   919,   936,   949,   960,   969,   977,
 378      983,   994,  1001,  1006,  1010,  1013,  1015,  1017,
 379     1018,  1020,  1022,  1022,  1023,  1023,  1023,  1024,
 380   };
 381   static const int xsq_iq_q10[] = {
 382          0,      4,      8,     12,     16,     20,     24,     28,
 383         32,     40,     48,     56,     64,     72,     80,     88,
 384         96,    112,    128,    144,    160,    176,    192,    208,
 385        224,    256,    288,    320,    352,    384,    416,    448,
 386        480,    544,    608,    672,    736,    800,    864,    928,
 387        992,   1120,   1248,   1376,   1504,   1632,   1760,   1888,
 388       2016,   2272,   2528,   2784,   3040,   3296,   3552,   3808,
 389       4064,   4576,   5088,   5600,   6112,   6624,   7136,   7648,
 390       8160,   9184,  10208,  11232,  12256,  13280,  14304,  15328,
 391      16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,
 392      32736,  36832,  40928,  45024,  49120,  53216,  57312,  61408,
 393      65504,  73696,  81888,  90080,  98272, 106464, 114656, 122848,
 394     131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728,
 395   };
 396   const int tmp = (xsq_q10 >> 2) + 8;
 397   const int k = get_msb(tmp) - 3;
 398   const int xq = (k << 3) + ((tmp >> k) & 0x7);
 399   const int one_q10 = 1 << 10;
 400   const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
 401   const int b_q10 = one_q10 - a_q10;
 402   *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
 403   *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
 404 }
 405
 406 void vp10_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
 407                                   unsigned int qstep, int *rate,
 408                                   int64_t *dist) {
 409   // This function models the rate and distortion for a Laplacian
 410   // source with given variance when quantized with a uniform quantizer
 411   // with given stepsize. The closed form expressions are in:
 412   // Hang and Chen, "Source Model for transform video coder and its
 413   // application - Part I: Fundamental Theory", IEEE Trans. Circ.
 414   // Sys. for Video Tech., April 1997.
 415   if (var == 0) {
 416     *rate = 0;
 417     *dist = 0;
 418   } else {
 419     int d_q10, r_q10;
 420     static const uint32_t MAX_XSQ_Q10 = 245727;
 421     const uint64_t xsq_q10_64 =
 422         (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
 423     const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
 424     model_rd_norm(xsq_q10, &r_q10, &d_q10);
 425     *rate = ((r_q10 << n_log2) + 2) >> 2;
 426     *dist = (var * (int64_t)d_q10 + 512) >> 10;
 427   }
 428 }
 429
 430 void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
 431                               const struct macroblockd_plane *pd,
 432                               ENTROPY_CONTEXT t_above[16],
 433                               ENTROPY_CONTEXT t_left[16]) {
 434   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
 435   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
 436   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
 437   const ENTROPY_CONTEXT *const above = pd->above_context;
 438   const ENTROPY_CONTEXT *const left = pd->left_context;
 439
 440   int i;
 441   switch (tx_size) {
 442     case TX_4X4:
 443       memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
 444       memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
 445       break;
 446     case TX_8X8:
 447       for (i = 0; i < num_4x4_w; i += 2)
 448         t_above[i] = !!*(const uint16_t *)&above[i];
 449       for (i = 0; i < num_4x4_h; i += 2)
 450         t_left[i] = !!*(const uint16_t *)&left[i];
 451       break;
 452     case TX_16X16:
 453       for (i = 0; i < num_4x4_w; i += 4)
 454         t_above[i] = !!*(const uint32_t *)&above[i];
 455       for (i = 0; i < num_4x4_h; i += 4)
 456         t_left[i] = !!*(const uint32_t *)&left[i];
 457       break;
 458     case TX_32X32:
 459       for (i = 0; i < num_4x4_w; i += 8)
 460         t_above[i] = !!*(const uint64_t *)&above[i];
 461       for (i = 0; i < num_4x4_h; i += 8)
 462         t_left[i] = !!*(const uint64_t *)&left[i];
 463       break;
 464     default:
 465       assert(0 && "Invalid transform size.");
 466       break;
 467   }
 468 }
 469
 470 void vp10_mv_pred(VP10_COMP *cpi, MACROBLOCK *x,
 471                  uint8_t *ref_y_buffer, int ref_y_stride,
 472                  int ref_frame, BLOCK_SIZE block_size) {
 473   int i;
 474   int zero_seen = 0;
 475   int best_index = 0;
 476   int best_sad = INT_MAX;
 477   int this_sad = INT_MAX;
 478   int max_mv = 0;
 479   int near_same_nearest;
 480   uint8_t *src_y_ptr = x->plane[0].src.buf;
 481   uint8_t *ref_y_ptr;
 482   const int num_mv_refs = MAX_MV_REF_CANDIDATES +
 483                     (cpi->sf.adaptive_motion_search &&
 484                      block_size < x->max_partition_size);
 485
 486   MV pred_mv[3];
 487   pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
 488   pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
 489   pred_mv[2] = x->pred_mv[ref_frame];
 490   assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
 491
 492   near_same_nearest =
 493       x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
 494           x->mbmi_ext->ref_mvs[ref_frame][1].as_int;
 495   // Get the sad for each candidate reference mv.
 496   for (i = 0; i < num_mv_refs; ++i) {
 497     const MV *this_mv = &pred_mv[i];
 498     int fp_row, fp_col;
 499
 500     if (i == 1 && near_same_nearest)
 501       continue;
 502     fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
 503     fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
 504     max_mv = VPXMAX(max_mv, VPXMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
 505
 506     if (fp_row ==0 && fp_col == 0 && zero_seen)
 507       continue;
 508     zero_seen |= (fp_row ==0 && fp_col == 0);
 509
 510     ref_y_ptr =&ref_y_buffer[ref_y_stride * fp_row + fp_col];
 511     // Find sad for current vector.
 512     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
 513                                            ref_y_ptr, ref_y_stride);
 514     // Note if it is the best so far.
 515     if (this_sad < best_sad) {
 516       best_sad = this_sad;
 517       best_index = i;
 518     }
 519   }
 520
 521   // Note the index of the mv that worked best in the reference list.
 522   x->mv_best_ref_index[ref_frame] = best_index;
 523   x->max_mv_context[ref_frame] = max_mv;
 524   x->pred_mv_sad[ref_frame] = best_sad;
 525 }
 526
 527 void vp10_setup_pred_block(const MACROBLOCKD *xd,
 528                           struct buf_2d dst[MAX_MB_PLANE],
 529                           const YV12_BUFFER_CONFIG *src,
 530                           int mi_row, int mi_col,
 531                           const struct scale_factors *scale,
 532                           const struct scale_factors *scale_uv) {
 533   int i;
 534
 535   dst[0].buf = src->y_buffer;
 536   dst[0].stride = src->y_stride;
 537   dst[1].buf = src->u_buffer;
 538   dst[2].buf = src->v_buffer;
 539   dst[1].stride = dst[2].stride = src->uv_stride;
 540
 541   for (i = 0; i < MAX_MB_PLANE; ++i) {
 542     setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
 543                      i ? scale_uv : scale,
 544                      xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
 545   }
 546 }
 547
 548 int vp10_raster_block_offset(BLOCK_SIZE plane_bsize,
 549                             int raster_block, int stride) {
 550   const int bw = b_width_log2_lookup[plane_bsize];
 551   const int y = 4 * (raster_block >> bw);
 552   const int x = 4 * (raster_block & ((1 << bw) - 1));
 553   return y * stride + x;
 554 }
 555
 556 int16_t* vp10_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
 557                                        int raster_block, int16_t *base) {
 558   const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
 559   return base + vp10_raster_block_offset(plane_bsize, raster_block, stride);
 560 }
 561
 562 YV12_BUFFER_CONFIG *vp10_get_scaled_ref_frame(const VP10_COMP *cpi,
 563                                              int ref_frame) {
 564   const VP10_COMMON *const cm = &cpi->common;
 565   const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
 566   const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
 567   return
 568       (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) ?
 569           &cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL;
 570 }
 571
 572 int vp10_get_switchable_rate(const VP10_COMP *cpi,
 573                              const MACROBLOCKD *const xd) {
 574   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 575   const int ctx = vp10_get_pred_context_switchable_interp(xd);
 576   return SWITCHABLE_INTERP_RATE_FACTOR *
 577              cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
 578 }
 579
 580 void vp10_set_rd_speed_thresholds(VP10_COMP *cpi) {
 581   int i;
 582   RD_OPT *const rd = &cpi->rd;
 583   SPEED_FEATURES *const sf = &cpi->sf;
 584
 585   // Set baseline threshold values.
 586   for (i = 0; i < MAX_MODES; ++i)
 587     rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;
 588
 589   if (sf->adaptive_rd_thresh) {
 590     rd->thresh_mult[THR_NEARESTMV] = 300;
 591     rd->thresh_mult[THR_NEARESTG] = 300;
 592     rd->thresh_mult[THR_NEARESTA] = 300;
 593   } else {
 594     rd->thresh_mult[THR_NEARESTMV] = 0;
 595     rd->thresh_mult[THR_NEARESTG] = 0;
 596     rd->thresh_mult[THR_NEARESTA] = 0;
 597   }
 598
 599   rd->thresh_mult[THR_DC] += 1000;
 600
 601   rd->thresh_mult[THR_NEWMV] += 1000;
 602   rd->thresh_mult[THR_NEWA] += 1000;
 603   rd->thresh_mult[THR_NEWG] += 1000;
 604
 605   rd->thresh_mult[THR_NEARMV] += 1000;
 606   rd->thresh_mult[THR_NEARA] += 1000;
 607   rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
 608   rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
 609
 610   rd->thresh_mult[THR_TM] += 1000;
 611
 612   rd->thresh_mult[THR_COMP_NEARLA] += 1500;
 613   rd->thresh_mult[THR_COMP_NEWLA] += 2000;
 614   rd->thresh_mult[THR_NEARG] += 1000;
 615   rd->thresh_mult[THR_COMP_NEARGA] += 1500;
 616   rd->thresh_mult[THR_COMP_NEWGA] += 2000;
 617
 618   rd->thresh_mult[THR_ZEROMV] += 2000;
 619   rd->thresh_mult[THR_ZEROG] += 2000;
 620   rd->thresh_mult[THR_ZEROA] += 2000;
 621   rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
 622   rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
 623
 624   rd->thresh_mult[THR_H_PRED] += 2000;
 625   rd->thresh_mult[THR_V_PRED] += 2000;
 626   rd->thresh_mult[THR_D45_PRED ] += 2500;
 627   rd->thresh_mult[THR_D135_PRED] += 2500;
 628   rd->thresh_mult[THR_D117_PRED] += 2500;
 629   rd->thresh_mult[THR_D153_PRED] += 2500;
 630   rd->thresh_mult[THR_D207_PRED] += 2500;
 631   rd->thresh_mult[THR_D63_PRED] += 2500;
 632 }
 633
 634 void vp10_set_rd_speed_thresholds_sub8x8(VP10_COMP *cpi) {
 635   static const int thresh_mult[2][MAX_REFS] =
 636       {{2500, 2500, 2500, 4500, 4500, 2500},
 637        {2000, 2000, 2000, 4000, 4000, 2000}};
 638   RD_OPT *const rd = &cpi->rd;
 639   const int idx = cpi->oxcf.mode == BEST;
 640   memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
 641 }
 642
 643 void vp10_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
 644                                int bsize, int best_mode_index) {
 645   if (rd_thresh > 0) {
 646     const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
 647     int mode;
 648     for (mode = 0; mode < top_mode; ++mode) {
 649       const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4);
 650       const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_64X64);
 651       BLOCK_SIZE bs;
 652       for (bs = min_size; bs <= max_size; ++bs) {
 653         int *const fact = &factor_buf[bs][mode];
 654         if (mode == best_mode_index) {
 655           *fact -= (*fact >> 4);
 656         } else {
 657           *fact = VPXMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
 658         }
 659       }
 660     }
 661   }
 662 }
 663
 664 int vp10_get_intra_cost_penalty(int qindex, int qdelta,
 665                                vpx_bit_depth_t bit_depth) {
 666   const int q = vp10_dc_quant(qindex, qdelta, bit_depth);
 667 #if CONFIG_VP9_HIGHBITDEPTH
 668   switch (bit_depth) {
 669     case VPX_BITS_8:
 670       return 20 * q;
 671     case VPX_BITS_10:
 672       return 5 * q;
 673     case VPX_BITS_12:
 674       return ROUND_POWER_OF_TWO(5 * q, 2);
 675     default:
 676       assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
 677       return -1;
 678   }
 679 #else
 680   return 20 * q;
 681 #endif  // CONFIG_VP9_HIGHBITDEPTH
 682 }
 683