Make coefficient skip condition an explicit RD choice.

author Ronald S. Bultje <rbultje@google.com>

Fri, 28 Jun 2013 00:41:54 +0000 (17:41 -0700)

committer Ronald S. Bultje <rbultje@google.com>

Fri, 28 Jun 2013 17:28:49 +0000 (10:28 -0700)
author Ronald S. Bultje <rbultje@google.com>
Fri, 28 Jun 2013 00:41:54 +0000 (17:41 -0700)
committer Ronald S. Bultje <rbultje@google.com>
Fri, 28 Jun 2013 17:28:49 +0000 (10:28 -0700)
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh

index c76e8f7364b946782183d61052a44c2d6977100b..bddbd49ec638af1f240c8fbf68dc2f0989d37397 100644 (file)
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -558,7 +558,7 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *"
  specialize vp9_get_mb_ss mmx sse2
  # ENCODEMB INVOKE
  
-prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size"
+prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
  specialize vp9_block_error sse2
  
  prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h

index 59cc3d95cf479a832982d953f0d4a285d5cac1f1..74f61a1010d0b584aa9171e3a09ec4b9cd77d6e6 100644 (file)
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -68,7 +68,6 @@ struct macroblock_plane {
    int16_t *quant;
    uint8_t *quant_shift;
    int16_t *zbin;
-  int16_t *zrun_zbin_boost;
    int16_t *round;
  
    // Zbin Over Quant value
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h

index a1f567aedbf89a50c478a581da1f36e23f9544fd..0e6c97a5e100f5ed7fe853c571e40f95b49eac9e 100644 (file)
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -268,11 +268,7 @@ typedef struct VP9_COMP {
    DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]);
    DECLARE_ALIGNED(16, short, a_round[QINDEX_RANGE][16]);
-
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]);
  #endif
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
  
    MACROBLOCK mb;
    VP9_COMMON common;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c

index 8deeea13dcb8a2893a9ecff219d4864e708c73b4..e68a48b12b63bcbf6245570acec16ba4a844bf06 100644 (file)
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -21,8 +21,7 @@
  extern int enc_debug;
  #endif
  
-static void quantize(int16_t *zbin_boost_orig_ptr,
-                     int16_t *coeff_ptr, int n_coeffs, int skip_block,
+static void quantize(int16_t *coeff_ptr, int n_coeffs, int skip_block,
                       int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
                       uint8_t *quant_shift_ptr,
                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
@@ -31,8 +30,6 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
    int i, rc, eob;
    int zbins[2], nzbins[2], zbin;
    int x, y, z, sz;
-  int zero_run = 0;
-  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
    int zero_flag = n_coeffs;
  
    vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -65,8 +62,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
        rc = scan[i];
        z  = coeff_ptr[rc];
  
-      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
-      zero_run += (zero_run < 15);
+      zbin = (zbins[rc != 0]);
  
        sz = (z >> 31);                               // sign of z
        x  = (z ^ sz) - sz;
@@ -81,7 +77,6 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
  
          if (y) {
            eob = i;                                  // last nonzero coeffs
-          zero_run = 0;                             // set zero_run
          }
        }
      }
@@ -90,8 +85,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
  }
  
  // This function works well for large transform size.
-static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
-                            int16_t *coeff_ptr, int n_coeffs, int skip_block,
+static void quantize_sparse(int16_t *coeff_ptr, int n_coeffs, int skip_block,
                              int16_t *zbin_ptr, int16_t *round_ptr,
                              int16_t *quant_ptr, uint8_t *quant_shift_ptr,
                              int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
@@ -101,10 +95,7 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
    int i, rc, eob;
    int zbins[2], nzbins[2], zbin;
    int x, y, z, sz;
-  int zero_run = 0;
-  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
    int idx = 0;
-  int pre_idx = 0;
  
    vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
    vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -135,11 +126,8 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
        rc = scan[idx_arr[i]];
  
        // Calculate ZBIN
-      zero_run += idx_arr[i] - pre_idx;
-      if(zero_run > 15) zero_run = 15;
-      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+      zbin = (zbins[rc != 0]);
  
-      pre_idx = idx_arr[i];
        z = coeff_ptr[rc] * 2;
        sz = (z >> 31);                               // sign of z
        x  = (z ^ sz) - sz;                           // x = abs(z)
@@ -155,7 +143,6 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
  
          if (y) {
            eob = idx_arr[i];                         // last nonzero coeffs
-          zero_run = -1;                            // set zero_run
          }
        }
      }
@@ -189,8 +176,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
      // Save index of picked coefficient in pre-scan pass.
      int idx_arr[1024];
  
-    quantize_sparse(mb->plane[plane].zrun_zbin_boost,
-                    BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+    quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
                      n_coeffs, mb->skip_block,
                      mb->plane[plane].zbin,
                      mb->plane[plane].round,
@@ -204,8 +190,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
                      scan, idx_arr);
    }
    else {
-    quantize(mb->plane[plane].zrun_zbin_boost,
-             BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+    quantize(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
               n_coeffs, mb->skip_block,
               mb->plane[plane].zbin,
               mb->plane[plane].round,
@@ -226,8 +211,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
    const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
    const int *pt_scan = get_scan_4x4(tx_type);
  
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+  quantize(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
             16, mb->skip_block,
             mb->plane[pb_idx.plane].zbin,
             mb->plane[pb_idx.plane].round,
@@ -261,9 +245,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
  #endif
    int q;
  
-  static const int zbin_boost[16] = { 0,  0,  0,  8,  8,  8, 10, 12,
-                                     14, 16, 20, 24, 28, 32, 36, 40 };
-
    for (q = 0; q < QINDEX_RANGE; q++) {
      int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80;
      int qrounding_factor = 48;
@@ -277,14 +258,12 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
      cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
      cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
      cpi->common.y_dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7;
  
      quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
      invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
      cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
      cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
      cpi->common.uv_dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
  
  #if CONFIG_ALPHA
      quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q);
@@ -292,7 +271,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
      cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
      cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7;
      cpi->common.a_dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7;
  #endif
  
      quant_val = vp9_ac_quant(q, 0);
@@ -310,15 +288,11 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
        invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val);
        cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
        cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->zrun_zbin_boost_y[q][i] =
-          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
  
        invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc,
          quant_uv_val);
        cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
        cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
-      cpi->zrun_zbin_boost_uv[q][i] =
-          ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
  
  #if CONFIG_ALPHA
        invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc,
@@ -326,8 +300,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
        cpi->a_zbin[q][rc] =
            ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
        cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7;
-      cpi->zrun_zbin_boost_a[q][i] =
-          ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7);
  #endif
      }
    }
@@ -348,7 +320,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
    x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
    x->plane[0].zbin = cpi->y_zbin[qindex];
    x->plane[0].round = cpi->y_round[qindex];
-  x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex];
    x->plane[0].zbin_extra = (int16_t)zbin_extra;
    x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
  
@@ -361,7 +332,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
      x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
      x->plane[i].zbin = cpi->uv_zbin[qindex];
      x->plane[i].round = cpi->uv_round[qindex];
-    x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
      x->plane[i].zbin_extra = (int16_t)zbin_extra;
      x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
    }
@@ -371,7 +341,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
    x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
    x->plane[3].zbin = cpi->a_zbin[qindex];
    x->plane[3].round = cpi->a_round[qindex];
-  x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex];
    x->plane[3].zbin_extra = (int16_t)zbin_extra;
    x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
  #endif
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c

index 37fc9316dc2ee93169116440eeb1a3ab8b1366a1..7a2ec56bb0801e27f3a728beb781274bcaf1487d 100644 (file)
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -283,15 +283,17 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
  }
  
  int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
-                          intptr_t block_size) {
+                          intptr_t block_size, int64_t *ssz) {
    int i;
-  int64_t error = 0;
+  int64_t error = 0, sqcoeff = 0;
  
    for (i = 0; i < block_size; i++) {
      int this_diff = coeff[i] - dqcoeff[i];
      error += (unsigned)this_diff * this_diff;
+    sqcoeff += (unsigned) coeff[i] * coeff[i];
    }
  
+  *ssz = sqcoeff;
    return error;
  }
  
@@ -501,27 +503,31 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
  }
  
  static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
-                               int shift) {
+                               int shift, int64_t *sse) {
    struct macroblockd_plane *p = &x->e_mbd.plane[0];
    const int bw = plane_block_width(bsize, p);
    const int bh = plane_block_height(bsize, p);
-  return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                         bw * bh) >> shift;
+  int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                              bw * bh, sse) >> shift;
+  *sse >>= shift;
+  return e;
  }
  
  static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
-                                int shift) {
-  int64_t sum = 0;
+                                int shift, int64_t *sse) {
+  int64_t sum = 0, this_sse;
    int plane;
  
+  *sse = 0;
    for (plane = 1; plane < MAX_MB_PLANE; plane++) {
      struct macroblockd_plane *p = &x->e_mbd.plane[plane];
      const int bw = plane_block_width(bsize, p);
      const int bh = plane_block_height(bsize, p);
      sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
-                           bw * bh);
+                           bw * bh, &this_sse);
+    *sse += this_sse;
    }
-
+  *sse >>= shift;
    return sum >> shift;
  }
  
@@ -581,7 +587,7 @@ static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
  
  static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
                                       int *rate, int64_t *distortion,
-                                     int *skippable,
+                                     int *skippable, int64_t *sse,
                                       BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
    MACROBLOCKD *const xd = &x->e_mbd;
    xd->mode_info_context->mbmi.txfm_size = tx_size;
@@ -591,18 +597,18 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
    else
      vp9_xform_quant_sby(cm, x, bsize);
  
-  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
+  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2, sse);
    *rate       = rdcost_plane(cm, x, 0, bsize, tx_size);
    *skippable  = vp9_sby_is_skippable(xd, bsize);
  }
  
  static void super_block_yrd(VP9_COMP *cpi,
                              MACROBLOCK *x, int *rate, int64_t *distortion,
-                            int *skip, BLOCK_SIZE_TYPE bs,
+                            int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
                              int64_t txfm_cache[NB_TXFM_MODES]) {
    VP9_COMMON *const cm = &cpi->common;
    int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
-  int64_t d[TX_SIZE_MAX_SB];
+  int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB];
    MACROBLOCKD *xd = &x->e_mbd;
    MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
  
@@ -621,25 +627,27 @@ static void super_block_yrd(VP9_COMP *cpi,
        mbmi->txfm_size = TX_4X4;
      }
      vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
-    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
+    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, &sse[0], bs,
                               mbmi->txfm_size);
      return;
    }
    if (bs >= BLOCK_SIZE_SB32X32)
      super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
-                             bs, TX_32X32);
+                             &sse[TX_32X32], bs, TX_32X32);
    if (bs >= BLOCK_SIZE_MB16X16)
      super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
-                             bs, TX_16X16);
-  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
-                           TX_8X8);
-  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
-                           TX_4X4);
+                             &sse[TX_16X16], bs, TX_16X16);
+  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+                           &sse[TX_8X8], bs, TX_8X8);
+  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+                           &sse[TX_4X4], bs, TX_4X4);
  
    choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
                             skip, txfm_cache,
                             TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
                             - (bs < BLOCK_SIZE_MB16X16));
+  if (psse)
+    *psse = sse[mbmi->txfm_size];
  }
  
  static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
@@ -688,6 +696,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
  
      for (idy = 0; idy < bh; ++idy) {
        for (idx = 0; idx < bw; ++idx) {
+        int64_t ssz;
+
          block = ib + idy * 2 + idx;
          xd->mode_info_context->bmi[block].as_mode.first = mode;
          src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
@@ -718,7 +728,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
          ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
                               tempa + idx, templ + idy, TX_4X4, 16);
          distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
-                                                          block, 16), 16) >> 2;
+                                                          block, 16),
+                                      16, &ssz) >> 2;
  
          if (best_tx_type != DCT_DCT)
            vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
@@ -881,7 +892,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
      }
      x->e_mbd.mode_info_context->mbmi.mode = mode;
  
-    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
                      bsize, local_txfm_cache);
  
      this_rate = this_rate_tokenonly + bmode_costs[mode];
@@ -914,22 +925,25 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
  
  static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
                                        int *rate, int64_t *distortion,
-                                      int *skippable, BLOCK_SIZE_TYPE bsize,
+                                      int *skippable, int64_t *sse,
+                                      BLOCK_SIZE_TYPE bsize,
                                        TX_SIZE uv_tx_size) {
    MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t dummy;
    if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
      vp9_encode_intra_block_uv(cm, x, bsize);
    else
      vp9_xform_quant_sbuv(cm, x, bsize);
  
-  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2,
+                                 sse ? sse : &dummy);
    *rate       = rdcost_uv(cm, x, bsize, uv_tx_size);
    *skippable  = vp9_sbuv_is_skippable(xd, bsize);
  }
  
  static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
                               int *rate, int64_t *distortion, int *skippable,
-                             BLOCK_SIZE_TYPE bsize) {
+                             int64_t *sse, BLOCK_SIZE_TYPE bsize) {
    MACROBLOCKD *const xd = &x->e_mbd;
    MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
    TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
@@ -937,7 +951,7 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
    if (mbmi->ref_frame[0] > INTRA_FRAME)
      vp9_subtract_sbuv(x, bsize);
  
-  super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+  super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize,
                              uv_txfm_size);
  }
  
@@ -954,7 +968,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
    for (mode = DC_PRED; mode <= TM_PRED; mode++) {
      x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
      super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
-                     &this_distortion, &s, bsize);
+                     &this_distortion, &s, NULL, bsize);
      this_rate = this_rate_tokenonly +
                  x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -1151,6 +1165,8 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
    k = i;
    for (idy = 0; idy < bh / 4; ++idy) {
      for (idx = 0; idx < bw / 4; ++idx) {
+      int64_t ssz;
+
        k += (idy * 2 + idx);
        src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
                                             x->plane[0].src_diff);
@@ -1159,7 +1175,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
        x->quantize_b_4x4(x, k, DCT_DCT, 16);
        thisdistortion += vp9_block_error(coeff,
                                          BLOCK_OFFSET(xd->plane[0].dqcoeff,
-                                                     k, 16), 16);
+                                                     k, 16), 16, &ssz);
        thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
                                ta + (k & 1),
                                tl + (k >> 1), TX_4X4, 16);
@@ -2238,7 +2254,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                   INTERPOLATIONFILTERTYPE *best_filter,
                                   int_mv *frame_mv,
                                   int mi_row, int mi_col,
-                                 int_mv single_newmv[MAX_REF_FRAMES]) {
+                                 int_mv single_newmv[MAX_REF_FRAMES],
+                                 int64_t *psse) {
    VP9_COMMON *cm = &cpi->common;
    MACROBLOCKD *xd = &x->e_mbd;
    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@@ -2467,17 +2484,19 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  
    if (!x->skip) {
      int skippable_y, skippable_uv;
+    int64_t sseuv = INT_MAX;
  
      // Y cost and distortion
-    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
+    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
                      bsize, txfm_cache);
  
      *rate2 += *rate_y;
      *distortion += *distortion_y;
  
      super_block_uvrd(cm, x, rate_uv, distortion_uv,
-                     &skippable_uv, bsize);
+                     &skippable_uv, &sseuv, bsize);
  
+    *psse += sseuv;
      *rate2 += *rate_uv;
      *distortion += *distortion_uv;
      *skippable = skippable_y && skippable_uv;
@@ -2611,6 +2630,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    int bws = (1 << bwsl) / 4;  // mode_info step for subsize
    int bhsl = b_height_log2(bsize);
    int bhs = (1 << bhsl) / 4;  // mode_info step for subsize
+  int best_skip2 = 0;
  
    for (i = 0; i < 4; i++) {
      int j;
@@ -2702,6 +2722,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      int skippable;
      int64_t txfm_cache[NB_TXFM_MODES];
      int i;
+    int this_skip2 = 0;
+    int64_t total_sse = INT_MAX;
  
      for (i = 0; i < NB_TXFM_MODES; ++i)
        txfm_cache[i] = INT64_MAX;
@@ -2863,7 +2885,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
          txfm_cache[i] = txfm_cache[ONLY_4X4];
      } else if (ref_frame == INTRA_FRAME) {
        TX_SIZE uv_tx;
-      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
                        bsize, txfm_cache);
  
        uv_tx = mbmi->txfm_size;
@@ -2989,7 +3011,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                        BLOCK_SIZE_SB8X8);
        vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
        super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                                &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
+                                &uv_skippable, NULL, BLOCK_SIZE_SB8X8, TX_4X4);
        rate2 += rate_uv;
        distortion2 += distortion_uv;
        skippable = skippable && uv_skippable;
@@ -3017,7 +3039,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                    &mode_excluded, &disable_skip,
                                    &tmp_best_filter, frame_mv[this_mode],
                                    mi_row, mi_col,
-                                  single_newmv);
+                                  single_newmv, &total_sse);
        if (this_rd == INT64_MAX)
          continue;
      }
@@ -3062,10 +3084,29 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
              rate2 += prob_skip_cost;
            }
          }
+      } else if (mb_skip_allowed && ref_frame != INTRA_FRAME &&
+                 this_mode != SPLITMV) {
+        if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+            RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+          // Add in the cost of the no skip flag.
+          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                                          PRED_MBSKIP), 0);
+          rate2 += prob_skip_cost;
+        } else {
+          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                                              PRED_MBSKIP), 1);
+          rate2 += prob_skip_cost;
+          distortion2 = total_sse;
+          assert(total_sse >= 0);
+          rate2 -= (rate_y + rate_uv);
+          rate_y = 0;
+          rate_uv = 0;
+          this_skip2 = 1;
+        }
        } else if (mb_skip_allowed) {
          // Add in the cost of the no skip flag.
          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
-                                                        PRED_MBSKIP), 0);
+                                                            PRED_MBSKIP), 0);
          rate2 += prob_skip_cost;
        }
  
@@ -3119,6 +3160,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
          *returndistortion = distortion2;
          best_rd = this_rd;
          best_mbmode = *mbmi;
+        best_skip2 = this_skip2;
          best_partition = *x->partition_info;
  
          if (this_mode == I4X4_PRED || this_mode == SPLITMV)
@@ -3301,6 +3343,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
  
    // macroblock modes
    *mbmi = best_mbmode;
+  x->skip |= best_skip2;
    if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
        best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
      for (i = 0; i < 4; i++)
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm

index bb1ea71b9b54fb831646ec279961aaa51fc043f5..1126fdb61640e40cb820dc3928ed2e3ecbc9553a 100644 (file)
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -12,45 +12,62 @@
  
  SECTION .text
  
-; void vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size)
+; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+;                         int64_t *ssz)
  
  INIT_XMM sse2
-cglobal block_error, 3, 3, 6, uqc, dqc, size
-  pxor      m4, m4                 ; accumulator
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m6, m6                 ; ssz accumulator
    pxor      m5, m5                 ; dedicated zero register
    lea     uqcq, [uqcq+sizeq*2]
    lea     dqcq, [dqcq+sizeq*2]
    neg    sizeq
  .loop:
-  mova      m0, [uqcq+sizeq*2]
-  mova      m2, [dqcq+sizeq*2]
-  mova      m1, [uqcq+sizeq*2+mmsize]
-  mova      m3, [dqcq+sizeq*2+mmsize]
+  mova      m2, [uqcq+sizeq*2]
+  mova      m0, [dqcq+sizeq*2]
+  mova      m3, [uqcq+sizeq*2+mmsize]
+  mova      m1, [dqcq+sizeq*2+mmsize]
    psubw     m0, m2
    psubw     m1, m3
    ; individual errors are max. 15bit+sign, so squares are 30bit, and
    ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
    pmaddwd   m0, m0
    pmaddwd   m1, m1
+  pmaddwd   m2, m2
+  pmaddwd   m3, m3
    ; accumulate in 64bit
-  punpckldq m2, m0, m5
+  punpckldq m7, m0, m5
    punpckhdq m0, m5
-  punpckldq m3, m1, m5
-  punpckhdq m1, m5
-  paddq     m4, m2
+  paddq     m4, m7
+  punpckldq m7, m1, m5
    paddq     m4, m0
-  paddq     m4, m3
+  punpckhdq m1, m5
+  paddq     m4, m7
+  punpckldq m7, m2, m5
    paddq     m4, m1
+  punpckhdq m2, m5
+  paddq     m6, m7
+  punpckldq m7, m3, m5
+  paddq     m6, m2
+  punpckhdq m3, m5
+  paddq     m6, m7
+  paddq     m6, m3
    add    sizeq, mmsize
    jl .loop
  
    ; accumulate horizontally and store in return value
    movhlps   m5, m4
+  movhlps   m7, m6
    paddq     m4, m5
+  paddq     m6, m7
  %if ARCH_X86_64
    movq    rax, m4
+  movq [sszq], m6
  %else
+  mov     eax, sszm
    pshufd   m5, m4, 0x1
+  movq  [eax], m6
    movd    eax, m4
    movd    edx, m5
  %endif
author	Ronald S. Bultje <rbultje@google.com>
	Fri, 28 Jun 2013 00:41:54 +0000 (17:41 -0700)
committer	Ronald S. Bultje <rbultje@google.com>
	Fri, 28 Jun 2013 17:28:49 +0000 (10:28 -0700)
vp9/common/vp9_rtcd_defs.sh		patch | blob | history
vp9/encoder/vp9_block.h		patch | blob | history
vp9/encoder/vp9_onyx_int.h		patch | blob | history
vp9/encoder/vp9_quantize.c		patch | blob | history
vp9/encoder/vp9_rdopt.c		patch | blob | history
vp9/encoder/x86/vp9_error_sse2.asm		patch | blob | history