Merge "Cleanup typos, remove unnecessary lines, replace switch"

[libvpx] / vp9 / encoder / vp9_rdopt.c
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c

index 7d82f18e553fdf7f477951b72cd751c32e6638d9..4e9621065874cbf50bb7a2f9af6bd122e77c528a 100644 (file)
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -109,7 +109,7 @@ static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] =
  #define MAX_RD_THRESH_FREQ_FACT 32
  #define MAX_RD_THRESH_FREQ_INC 1
  
-static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2],
+static void fill_token_costs(vp9_coeff_cost *c,
                               vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
    int i, j, k, l;
    TX_SIZE t;
@@ -120,12 +120,12 @@ static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2],
            for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
              vp9_prob probs[ENTROPY_NODES];
              vp9_model_to_full_probs(p[t][i][j][k][l], probs);
-            vp9_cost_tokens((int *)c[t][i][j][0][k][l], probs,
+            vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
                              vp9_coef_tree);
-            vp9_cost_tokens_skip((int *)c[t][i][j][1][k][l], probs,
+            vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
                                   vp9_coef_tree);
-            assert(c[t][i][j][0][k][l][DCT_EOB_TOKEN] ==
-                   c[t][i][j][1][k][l][DCT_EOB_TOKEN]);
+            assert(c[t][i][j][k][0][l][DCT_EOB_TOKEN] ==
+                   c[t][i][j][k][1][l][DCT_EOB_TOKEN]);
            }
  }
  
@@ -513,11 +513,16 @@ int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
    return error;
  }
  
-static const int16_t band_counts[TX_SIZE_MAX_SB][8] = {
-  { 1, 2, 3, 4,  3,   16 - 13 },
-  { 1, 2, 3, 4, 11,   64 - 21 },
-  { 1, 2, 3, 4, 11,  256 - 21 },
-  { 1, 2, 3, 4, 11, 1024 - 21 },
+/* The trailing '0' is a terminator which is used inside cost_coeffs() to
+ * decide whether to include the cost of a trailing EOB node or not (i.e. we
+ * can skip this if the last coefficient in this transform block, e.g. the
+ * 16th coefficient in a 4x4 block or the 64th coefficient in an 8x8 block,
+ * was non-zero). */
+static const int16_t band_counts[TX_SIZES][8] = {
+  { 1, 2, 3, 4,  3,   16 - 13, 0 },
+  { 1, 2, 3, 4, 11,   64 - 21, 0 },
+  { 1, 2, 3, 4, 11,  256 - 21, 0 },
+  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
  };
  
  static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
@@ -528,11 +533,11 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
    MACROBLOCKD *const xd = &mb->e_mbd;
    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
    int pt, c, cost;
-  const int16_t *band_count = band_counts[tx_size];
+  const int16_t *band_count = &band_counts[tx_size][1];
    const int eob = xd->plane[plane].eobs[block];
    const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
    const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
-  unsigned int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS]
+  unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS]
                      [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref];
    ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
    uint8_t token_cache[1024];
@@ -552,13 +557,14 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
      cost = token_costs[0][0][pt][DCT_EOB_TOKEN];
      c = 0;
    } else {
-    int v, prev_t, band = 1, band_left = band_count[1];
+    int v, prev_t, band_left = *band_count++;
  
      // dc token
      v = qcoeff_ptr[0];
      prev_t = vp9_dct_value_tokens_ptr[v].token;
-    cost = token_costs[0][0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
+    cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
      token_cache[0] = vp9_pt_energy_class[prev_t];
+    ++token_costs;
  
      // ac tokens
      for (c = 1; c < eob; c++) {
@@ -568,18 +574,19 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
        v = qcoeff_ptr[rc];
        t = vp9_dct_value_tokens_ptr[v].token;
        pt = get_coef_context(nb, token_cache, c);
-      cost += token_costs[!prev_t][band][pt][t] + vp9_dct_value_cost_ptr[v];
+      cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
        token_cache[rc] = vp9_pt_energy_class[t];
        prev_t = t;
        if (!--band_left) {
-        band_left = band_count[++band];
+        band_left = *band_count++;
+        ++token_costs;
        }
      }
  
      // eob token
-    if (band < 6) {
+    if (band_left) {
        pt = get_coef_context(nb, token_cache, c);
-      cost += token_costs[0][band][pt][DCT_EOB_TOKEN];
+      cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
      }
    }
  
@@ -859,7 +866,7 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                       int (*r)[2], int *rate,
                                       int64_t *d, int64_t *distortion,
                                       int *s, int *skip,
-                                     int64_t txfm_cache[NB_TXFM_MODES],
+                                     int64_t txfm_cache[TX_MODES],
                                       BLOCK_SIZE_TYPE bs) {
    const TX_SIZE max_txfm_size = TX_32X32
        - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
@@ -867,7 +874,7 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
    MACROBLOCKD *const xd = &x->e_mbd;
    MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
    vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
-  int64_t rd[TX_SIZE_MAX_SB][2];
+  int64_t rd[TX_SIZES][2];
    int n, m;
    int s0, s1;
  
@@ -972,11 +979,11 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
    MACROBLOCKD *const xd = &x->e_mbd;
    MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
    vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
-  int64_t rd[TX_SIZE_MAX_SB][2];
+  int64_t rd[TX_SIZES][2];
    int n, m;
    int s0, s1;
-  double scale_rd[TX_SIZE_MAX_SB] = {1.73, 1.44, 1.20, 1.00};
-  // double scale_r[TX_SIZE_MAX_SB] = {2.82, 2.00, 1.41, 1.00};
+  double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
+  // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
  
    const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs);
  
@@ -1065,11 +1072,11 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
  static void super_block_yrd(VP9_COMP *cpi,
                              MACROBLOCK *x, int *rate, int64_t *distortion,
                              int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
-                            int64_t txfm_cache[NB_TXFM_MODES],
+                            int64_t txfm_cache[TX_MODES],
                              int64_t ref_best_rd) {
    VP9_COMMON *const cm = &cpi->common;
-  int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
-  int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB];
+  int r[TX_SIZES][2], s[TX_SIZES];
+  int64_t d[TX_SIZES], sse[TX_SIZES];
    MACROBLOCKD *xd = &x->e_mbd;
    MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
  
@@ -1080,7 +1087,7 @@ static void super_block_yrd(VP9_COMP *cpi,
    if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
        (cpi->sf.tx_size_search_method != USE_FULL_RD &&
         mbmi->ref_frame[0] == INTRA_FRAME)) {
-    vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
+    vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
      choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
                               ref_best_rd, bs);
      if (psse)
@@ -1090,7 +1097,7 @@ static void super_block_yrd(VP9_COMP *cpi,
  
    if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
        mbmi->ref_frame[0] > INTRA_FRAME) {
-    int model_used[TX_SIZE_MAX_SB] = {1, 1, 1, 1};
+    int model_used[TX_SIZES] = {1, 1, 1, 1};
      if (bs >= BLOCK_SIZE_SB32X32) {
        if (model_used[TX_32X32]) {
          model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
@@ -1174,10 +1181,11 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                       int *bestrate, int *bestratey,
                                       int64_t *bestdistortion,
-                                     BLOCK_SIZE_TYPE bsize) {
+                                     BLOCK_SIZE_TYPE bsize,
+                                     int64_t rd_thresh) {
    MB_PREDICTION_MODE mode;
    MACROBLOCKD *xd = &x->e_mbd;
-  int64_t best_rd = INT64_MAX;
+  int64_t best_rd = rd_thresh;
    int rate = 0;
    int64_t distortion;
    VP9_COMMON *const cm = &cpi->common;
@@ -1185,17 +1193,19 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
    struct macroblockd_plane *pd = &xd->plane[0];
    const int src_stride = p->src.stride;
    const int dst_stride = pd->dst.stride;
-  uint8_t *src, *dst;
+  uint8_t *src_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib,
+                                                p->src.buf, src_stride);
+  uint8_t *dst_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib,
+                                                pd->dst.buf, dst_stride);
    int16_t *src_diff, *coeff;
  
    ENTROPY_CONTEXT ta[2], tempa[2];
    ENTROPY_CONTEXT tl[2], templ[2];
    TX_TYPE tx_type = DCT_DCT;
-  TX_TYPE best_tx_type = DCT_DCT;
    int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
    int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
    int idx, idy, block;
-  DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]);
+  uint8_t best_dst[8 * 8];
  
    assert(ib < 4);
  
@@ -1223,17 +1233,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
        for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
          int64_t ssz;
          const int16_t *scan;
+        uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
+        uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
  
          block = ib + idy * 2 + idx;
          xd->mode_info_context->bmi[block].as_mode = mode;
-        src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                        p->src.buf, src_stride);
          src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block,
                                               p->src_diff);
          coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
-        dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                        pd->dst.buf, dst_stride);
-        vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8),
+        vp9_predict_intra_block(xd, block, 1,
                                  TX_4X4, mode,
                                  x->skip_encode ? src : dst,
                                  x->skip_encode ? src_stride : dst_stride,
@@ -1258,6 +1266,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
          distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
                                                            block, 16),
                                        16, &ssz) >> 2;
+        if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+          goto next;
  
          if (tx_type != DCT_DCT)
            vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
@@ -1277,46 +1287,22 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
        *bestdistortion = distortion;
        best_rd = this_rd;
        *best_mode = mode;
-      best_tx_type = tx_type;
        vpx_memcpy(a, tempa, sizeof(tempa));
        vpx_memcpy(l, templ, sizeof(templ));
-      for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
-        for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-          block = ib + idy * 2 + idx;
-          vpx_memcpy(best_dqcoeff[idy * 2 + idx],
-                     BLOCK_OFFSET(pd->dqcoeff, block, 16),
-                     sizeof(best_dqcoeff[0]));
-        }
-      }
+      for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
+        vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
+                   num_4x4_blocks_wide * 4);
      }
+  next:
+    {}
    }
  
-  if (x->skip_encode)
+  if (best_rd >= rd_thresh || x->skip_encode)
      return best_rd;
  
-  for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
-    for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-      block = ib + idy * 2 + idx;
-      xd->mode_info_context->bmi[block].as_mode = *best_mode;
-      src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                      p->src.buf, src_stride);
-      dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                      pd->dst.buf, dst_stride);
-
-      vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4,
-                              *best_mode,
-                              x->skip_encode ? src : dst,
-                              x->skip_encode ? src_stride : dst_stride,
-                              dst, dst_stride);
-      // inverse transform
-      if (best_tx_type != DCT_DCT)
-        vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst,
-                             dst_stride, best_tx_type);
-      else
-        xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst,
-                           dst_stride);
-    }
-  }
+  for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
+    vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
+               num_4x4_blocks_wide * 4);
  
    return best_rd;
  }
@@ -1349,7 +1335,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
        const int mis = xd->mode_info_stride;
        MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
        int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
-      int64_t UNINITIALIZED_IS_SAFE(d);
+      int64_t UNINITIALIZED_IS_SAFE(d), this_rd;
        i = idy * 2 + idx;
  
        if (cpi->common.frame_type == KEY_FRAME) {
@@ -1360,9 +1346,14 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
          bmode_costs  = mb->y_mode_costs[A][L];
        }
  
-      total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
-                                        t_above + idx, t_left + idy,
-                                        &r, &ry, &d, bsize);
+      this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
+                                      t_above + idx, t_left + idy,
+                                      &r, &ry, &d, bsize,
+                                      best_rd - total_rd);
+      if (this_rd >= best_rd - total_rd)
+        return INT64_MAX;
+
+      total_rd += this_rd;
        cost += r;
        distortion += d;
        tot_rate_y += ry;
@@ -1390,7 +1381,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                        int *rate, int *rate_tokenonly,
                                        int64_t *distortion, int *skippable,
                                        BLOCK_SIZE_TYPE bsize,
-                                      int64_t txfm_cache[NB_TXFM_MODES],
+                                      int64_t txfm_cache[TX_MODES],
                                        int64_t best_rd) {
    MB_PREDICTION_MODE mode;
    MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
@@ -1402,13 +1393,13 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
    int *bmode_costs = x->mbmode_cost;
  
    if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
-    for (i = 0; i < NB_TXFM_MODES; i++)
+    for (i = 0; i < TX_MODES; i++)
        txfm_cache[i] = INT64_MAX;
    }
  
    /* Y Search for intra prediction mode */
    for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t local_txfm_cache[NB_TXFM_MODES];
+    int64_t local_txfm_cache[TX_MODES];
      MODE_INFO *const mic = xd->mode_info_context;
      const int mis = xd->mode_info_stride;
  
@@ -1441,7 +1432,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
      }
  
      if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
+      for (i = 0; i < TX_MODES; i++) {
          int64_t adj_rd = this_rd + local_txfm_cache[i] -
              local_txfm_cache[cpi->common.tx_mode];
          if (adj_rd < txfm_cache[i]) {
@@ -2203,7 +2194,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
    MB_MODE_INFO *mbmi = &mi->mbmi;
    int mode_idx;
  
-  vpx_memset(bsi, 0, sizeof(*bsi));
+  vp9_zero(*bsi);
  
    bsi->segment_rd = best_rd;
    bsi->ref_mv = best_ref_mv;
@@ -2357,7 +2348,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                           int_mv *ref_mv,
                           int_mv *second_ref_mv,
                           int64_t comp_pred_diff[NB_PREDICTION_TYPES],
-                         int64_t txfm_size_diff[NB_TXFM_MODES],
+                         int64_t txfm_size_diff[TX_MODES],
                           int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]) {
    MACROBLOCKD *const xd = &x->e_mbd;
  
@@ -3026,53 +3017,80 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    if (cpi->common.mcomp_filter_type == SWITCHABLE)
      *rate2 += get_switchable_rate(cm, x);
  
-  if (cpi->active_map_enabled && x->active_ptr[0] == 0)
-    x->skip = 1;
-  else if (x->encode_breakout) {
-    const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]);
-    const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
-
-    unsigned int var, sse;
-    int threshold = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1] >> 4);
-
-
-    if (threshold < x->encode_breakout)
-      threshold = x->encode_breakout;
-
-    var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
-                                 xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-                                 &sse);
-
-    if ((int)sse < threshold) {
-      unsigned int q2dc = xd->plane[0].dequant[0];
-      // If there is no codeable 2nd order dc
-      // or a very small uniform pixel change change
-      if ((sse - var < q2dc * q2dc >> 4) ||
-          (sse / 2 > var && sse - var < 64)) {
-        // Check u and v to make sure skip is ok
-        int sse2;
-        unsigned int sse2u, sse2v;
-        var = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
-                                      x->plane[1].src.stride,
-                                      xd->plane[1].dst.buf,
-                                      xd->plane[1].dst.stride, &sse2u);
-        var = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
-                                      x->plane[2].src.stride,
-                                      xd->plane[2].dst.buf,
-                                      xd->plane[2].dst.stride, &sse2v);
-        sse2 = sse2u + sse2v;
-
-        if (sse2 * 2 < threshold) {
-          x->skip = 1;
-          *distortion = sse + sse2;
-          *rate2 = 500;
-
-          // for best yrd calculation
-          *rate_uv = 0;
-          *distortion_uv = sse2;
-
-          *disable_skip = 1;
-          this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+  if (!is_comp_pred) {
+    if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+      x->skip = 1;
+    else if (x->encode_breakout) {
+      const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]);
+      const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize,
+                                                           &xd->plane[1]);
+      unsigned int var, sse;
+      // Skipping threshold for ac.
+      unsigned int thresh_ac;
+      // The encode_breakout input
+      unsigned int encode_breakout = x->encode_breakout << 4;
+
+      // Calculate threshold according to dequant value.
+      thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+
+      // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
+      if (thresh_ac > 36000)
+        thresh_ac = 36000;
+
+      // Use encode_breakout input if it is bigger than internal threshold.
+      if (thresh_ac < encode_breakout)
+        thresh_ac = encode_breakout;
+
+      var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                                   xd->plane[0].dst.buf,
+                                   xd->plane[0].dst.stride, &sse);
+
+      // Adjust threshold according to partition size.
+      thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
+          b_height_log2_lookup[bsize]);
+
+      // Y skipping condition checking
+      if (sse < thresh_ac || sse == 0) {
+        // Skipping threshold for dc
+        unsigned int thresh_dc;
+
+        thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+
+        // dc skipping checking
+        if ((sse - var) < thresh_dc || sse == var) {
+          unsigned int sse_u, sse_v;
+          unsigned int var_u, var_v;
+
+          var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
+                                          x->plane[1].src.stride,
+                                          xd->plane[1].dst.buf,
+                                          xd->plane[1].dst.stride, &sse_u);
+
+          // U skipping condition checking
+          if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
+              (sse_u - var_u < thresh_dc || sse_u == var_u)) {
+            var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
+                                            x->plane[2].src.stride,
+                                            xd->plane[2].dst.buf,
+                                            xd->plane[2].dst.stride, &sse_v);
+
+            // V skipping condition checking
+            if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
+                (sse_v - var_v < thresh_dc || sse_v == var_v)) {
+              x->skip = 1;
+
+              *rate2 = 500;
+              *rate_uv = 0;
+
+              // Scaling factor for SSE from spatial domain to frequency domain
+              // is 16. Adjust distortion accordingly.
+              *distortion_uv = (sse_u + sse_v) << 4;
+              *distortion = (sse << 4) + *distortion_uv;
+
+              *disable_skip = 1;
+              this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+            }
+          }
          }
        }
      }
@@ -3124,7 +3142,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    MACROBLOCKD *const xd = &x->e_mbd;
    int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
    int y_skip = 0, uv_skip;
-  int64_t dist_y = 0, dist_uv = 0, txfm_cache[NB_TXFM_MODES];
+  int64_t dist_y = 0, dist_uv = 0, txfm_cache[TX_MODES];
  
    x->skip_encode = 0;
    vpx_memset(&txfm_cache, 0, sizeof(txfm_cache));
@@ -3161,7 +3179,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
          vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0);
      *returndist = dist_y + (dist_uv >> 2);
      if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
+      for (i = 0; i < TX_MODES; i++) {
          ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->tx_mode];
        }
      }
@@ -3196,8 +3214,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                       cpi->alt_fb_idx};
    int64_t best_rd = best_rd_so_far;
    int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
-  int64_t best_txfm_rd[NB_TXFM_MODES];
-  int64_t best_txfm_diff[NB_TXFM_MODES];
+  int64_t best_txfm_rd[TX_MODES];
+  int64_t best_txfm_diff[TX_MODES];
    int64_t best_pred_diff[NB_PREDICTION_TYPES];
    int64_t best_pred_rd[NB_PREDICTION_TYPES];
    int64_t best_filter_rd[VP9_SWITCHABLE_FILTERS + 1];
@@ -3213,10 +3231,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    // MB_PREDICTION_MODE best_inter_mode = ZEROMV;
    MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
    INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
-  int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB];
-  int64_t dist_uv[TX_SIZE_MAX_SB];
-  int skip_uv[TX_SIZE_MAX_SB];
-  MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB];
+  int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
+  int64_t dist_uv[TX_SIZES];
+  int skip_uv[TX_SIZES];
+  MB_PREDICTION_MODE mode_uv[TX_SIZES];
    struct scale_factors scale_factor[4];
    unsigned int ref_frame_mask = 0;
    unsigned int mode_mask = 0;
@@ -3252,11 +3270,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
  
    for (i = 0; i < NB_PREDICTION_TYPES; ++i)
      best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < NB_TXFM_MODES; i++)
+  for (i = 0; i < TX_MODES; i++)
      best_txfm_rd[i] = INT64_MAX;
    for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
      best_filter_rd[i] = INT64_MAX;
-  for (i = 0; i < TX_SIZE_MAX_SB; i++)
+  for (i = 0; i < TX_SIZES; i++)
      rate_uv_intra[i] = INT_MAX;
  
    *returnrate = INT_MAX;
@@ -3312,15 +3330,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      int rate2 = 0, rate_y = 0, rate_uv = 0;
      int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
      int skippable;
-    int64_t txfm_cache[NB_TXFM_MODES];
+    int64_t txfm_cache[TX_MODES];
      int i;
      int this_skip2 = 0;
      int64_t total_sse = INT_MAX;
      int early_term = 0;
  
-    for (i = 0; i < NB_TXFM_MODES; ++i)
+    for (i = 0; i < TX_MODES; ++i)
        txfm_cache[i] = INT64_MAX;
  
+    x->skip = 0;
      this_mode = vp9_mode_order[mode_index].mode;
      ref_frame = vp9_mode_order[mode_index].ref_frame;
      second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
@@ -3347,8 +3366,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME))
        continue;
  
-    x->skip = 0;
-
      // Skip some checking based on small partitions' result.
      if (x->fast_ms > 1 && !ref_frame)
        continue;
@@ -3494,8 +3511,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
          */
  
        mbmi->txfm_size = TX_4X4;
-      rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
-                                &distortion_y, INT64_MAX);
+      if (rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
+                                    &distortion_y, best_rd) >= best_rd)
+        continue;
        rate2 += rate;
        rate2 += intra_cost_penalty;
        distortion2 += distortion_y;
@@ -3512,7 +3530,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
        distortion_uv = dist_uv[TX_4X4];
        mbmi->uv_mode = mode_uv[TX_4X4];
        txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-      for (i = 0; i < NB_TXFM_MODES; ++i)
+      for (i = 0; i < TX_MODES; ++i)
          txfm_cache[i] = txfm_cache[ONLY_4X4];
      } else if (ref_frame == INTRA_FRAME) {
        TX_SIZE uv_tx;
@@ -3715,7 +3733,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
          total_sse += uv_sse;
  
          txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-        for (i = 0; i < NB_TXFM_MODES; ++i)
+        for (i = 0; i < TX_MODES; ++i)
            txfm_cache[i] = txfm_cache[ONLY_4X4];
        }
      } else {
@@ -3951,7 +3969,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
        txfm_cache[ALLOW_32X32] = txfm_cache[ALLOW_16X16];
      }
      if (!mode_excluded && this_rd != INT64_MAX) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
+      for (i = 0; i < TX_MODES; i++) {
          int64_t adj_rd = INT64_MAX;
          if (this_mode != I4X4_PRED) {
            adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->tx_mode];
@@ -3967,9 +3985,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      if (early_term)
        break;
  
-    if (x->skip && !mode_excluded)
+    if (x->skip && !comp_pred)
        break;
    }
+
    if (best_rd >= best_rd_so_far)
      return INT64_MAX;
  
@@ -4110,7 +4129,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    }
  
    if (!x->skip) {
-    for (i = 0; i < NB_TXFM_MODES; i++) {
+    for (i = 0; i < TX_MODES; i++) {
        if (best_txfm_rd[i] == INT64_MAX)
          best_txfm_diff[i] = 0;
        else