Enable non-rd mode coding on key frame, for speed 6.

author Marco <marpan@google.com>

Wed, 12 Nov 2014 22:51:49 +0000 (14:51 -0800)

committer Marco <marpan@google.com>

Wed, 3 Dec 2014 17:18:08 +0000 (09:18 -0800)
author Marco <marpan@google.com>
Wed, 12 Nov 2014 22:51:49 +0000 (14:51 -0800)
committer Marco <marpan@google.com>
Wed, 3 Dec 2014 17:18:08 +0000 (09:18 -0800)
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc

index c2e472b5d8719804384df64d465e910613ef98cc..fa04528a2413c092d25b06fe24e11381913c623e 100644 (file)
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -57,7 +57,7 @@ class AverageTestBase : public ::testing::Test {
    }
  
    // Sum Pixels
-  unsigned int ReferenceAverage(const uint8_t* source, int pitch ) {
+  unsigned int ReferenceAverage8x8(const uint8_t* source, int pitch ) {
      unsigned int average = 0;
      for (int h = 0; h < 8; ++h)
        for (int w = 0; w < 8; ++w)
@@ -65,6 +65,14 @@ class AverageTestBase : public ::testing::Test {
      return ((average + 32) >> 6);
    }
  
+  unsigned int ReferenceAverage4x4(const uint8_t* source, int pitch ) {
+    unsigned int average = 0;
+    for (int h = 0; h < 4; ++h)
+      for (int w = 0; w < 4; ++w)
+        average += source[h * source_stride_ + w];
+    return ((average + 8) >> 4);
+  }
+
    void FillConstant(uint8_t fill_constant) {
      for (int i = 0; i < width_ * height_; ++i) {
          source_data_[i] = fill_constant;
@@ -85,7 +93,7 @@ class AverageTestBase : public ::testing::Test {
  };
  typedef unsigned int (*AverageFunction)(const uint8_t* s, int pitch);
  
-typedef std::tr1::tuple<int, int, int, AverageFunction> AvgFunc;
+typedef std::tr1::tuple<int, int, int, int, AverageFunction> AvgFunc;
  
  class AverageTest
      : public AverageTestBase,
@@ -95,12 +103,18 @@ class AverageTest
  
   protected:
    void CheckAverages() {
-    unsigned int expected = ReferenceAverage(source_data_+ GET_PARAM(2),
-                                             source_stride_);
+    unsigned int expected = 0;
+    if (GET_PARAM(3) == 8) {
+      expected = ReferenceAverage8x8(source_data_+ GET_PARAM(2),
+                                     source_stride_);
+    } else  if (GET_PARAM(3) == 4) {
+      expected = ReferenceAverage4x4(source_data_+ GET_PARAM(2),
+                                     source_stride_);
+    }
  
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(3)(source_data_+ GET_PARAM(2),
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(4)(source_data_+ GET_PARAM(2),
                                            source_stride_));
-    unsigned int actual = GET_PARAM(3)(source_data_+ GET_PARAM(2),
+    unsigned int actual = GET_PARAM(4)(source_data_+ GET_PARAM(2),
                                         source_stride_);
  
      EXPECT_EQ(expected, actual);
@@ -134,16 +148,20 @@ using std::tr1::make_tuple;
  INSTANTIATE_TEST_CASE_P(
      C, AverageTest,
      ::testing::Values(
-        make_tuple(16, 16, 1, &vp9_avg_8x8_c)));
+        make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
+        make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
  
  
  #if HAVE_SSE2
  INSTANTIATE_TEST_CASE_P(
      SSE2, AverageTest,
      ::testing::Values(
-        make_tuple(16, 16, 0, &vp9_avg_8x8_sse2),
-        make_tuple(16, 16, 5, &vp9_avg_8x8_sse2),
-        make_tuple(32, 32, 15, &vp9_avg_8x8_sse2)));
+        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_sse2),
+        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_sse2),
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_sse2),
+        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_sse2),
+        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_sse2),
+        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_sse2)));
  
  #endif
  
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl

index ae12808640213a5e70bade1d74f58633bb93c3b7..281dcbd8b0de653d3dac662de05c49fdd74e26f4 100644 (file)
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1135,9 +1135,14 @@ specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
  add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
  specialize qw/vp9_avg_8x8 sse2/;
  
+add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
+specialize qw/vp9_avg_4x4 sse2/;
+
  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
    specialize qw/vp9_highbd_avg_8x8/;
+  add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vp9_highbd_avg_4x4/;
  }
  
  # ENCODEMB INVOKE
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c

index e9810c894d3b26fa106d0b1a43c48b9d3c153279..f8fa7d2e8dfda96b97e18ab51ec29d25fe8e6900 100644 (file)
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -19,6 +19,15 @@ unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
    return (sum + 32) >> 6;
  }
  
+unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 4; ++i, s+=p)
+    for (j = 0; j < 4; sum += s[j], ++j) {}
+
+  return (sum + 8) >> 4;
+}
+
  #if CONFIG_VP9_HIGHBITDEPTH
  unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
    int i, j;
@@ -29,5 +38,16 @@ unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
  
    return (sum + 32) >> 6;
  }
+
+unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 4; ++i, s+=p)
+    for (j = 0; j < 4; sum += s[j], ++j) {}
+
+  return (sum + 8) >> 4;
+}
  #endif  // CONFIG_VP9_HIGHBITDEPTH
  
+
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c

index d5122d0bcb4f1ab5beff566cd8c65e4b4a325cec..7788e502d3d8078f3c582af0569e0c92836bab24 100644 (file)
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -291,6 +291,11 @@ typedef struct {
  typedef struct {
    partition_variance part_variances;
    var split[4];
+} v4x4;
+
+typedef struct {
+  partition_variance part_variances;
+  v4x4 split[4];
  } v8x8;
  
  typedef struct {
@@ -348,6 +353,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
      case BLOCK_8X8: {
        v8x8 *vt = (v8x8 *) data;
        node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_4X4: {
+      v4x4 *vt = (v4x4 *) data;
+      node->part_variances = &vt->part_variances;
        for (i = 0; i < 4; i++)
          node->split[i] = &vt->split[i];
        break;
@@ -398,64 +410,76 @@ static int set_vt_partitioning(VP9_COMP *cpi,
    variance_node vt;
    const int block_width = num_8x8_blocks_wide_lookup[bsize];
    const int block_height = num_8x8_blocks_high_lookup[bsize];
-  // TODO(debargha): Choose this more intelligently.
-  const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 64 : 4;
+  // TODO(marpan): Adjust/tune these thresholds.
+  const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4;
    int64_t threshold =
        (int64_t)(threshold_multiplier *
                  vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
+  int64_t threshold_bsize_ref = threshold << 6;
+  int64_t threshold_low = threshold;
+  BLOCK_SIZE bsize_ref = BLOCK_16X16;
+
    assert(block_height == block_width);
    tree_to_node(data, bsize, &vt);
  
-  // Split none is available only if we have more than half a block size
-  // in width and height inside the visible image.
-  if (mi_col + block_width / 2 < cm->mi_cols &&
-      mi_row + block_height / 2 < cm->mi_rows &&
-      vt.part_variances->none.variance < threshold) {
-    set_block_size(cpi, xd, mi_row, mi_col, bsize);
-    return 1;
+  if (cm->frame_type == KEY_FRAME) {
+    bsize_ref = BLOCK_8X8;
+    // Choose lower thresholds for key frame variance to favor split.
+    threshold_bsize_ref = threshold >> 1;
+    threshold_low = threshold >> 2;
    }
  
-  // Only allow split for blocks above 16x16.
-  if (bsize > BLOCK_16X16) {
-    // Vertical split is available on all but the bottom border.
+  // For bsize=bsize_ref (16x16/8x8 for 8x8/4x4 downsampling), select if
+  // variance is below threshold, otherwise split will be selected.
+  // No check for vert/horiz split as too few samples for variance.
+  if (bsize == bsize_ref) {
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold_bsize_ref) {
+      set_block_size(cpi, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+    return 0;
+  } else if (bsize > bsize_ref) {
+    // For key frame, for bsize above 32X32, or very high variance, take split.
+    if (cm->frame_type == KEY_FRAME &&
+        (bsize > BLOCK_32X32 ||
+        vt.part_variances->none.variance > (threshold << 2))) {
+      return 0;
+    }
+    // If variance is low, take the bsize (no split).
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold_low) {
+      set_block_size(cpi, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+    // Check vertical split.
      if (mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->vert[0].variance < threshold &&
-        vt.part_variances->vert[1].variance < threshold) {
+        vt.part_variances->vert[0].variance < threshold_low &&
+        vt.part_variances->vert[1].variance < threshold_low) {
        BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
        set_block_size(cpi, xd, mi_row, mi_col, subsize);
        set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
        return 1;
      }
-
-    // Horizontal split is available on all but the right border.
+    // Check horizontal split.
      if (mi_col + block_width / 2 < cm->mi_cols &&
-        vt.part_variances->horz[0].variance < threshold &&
-        vt.part_variances->horz[1].variance < threshold) {
+        vt.part_variances->horz[0].variance < threshold_low &&
+        vt.part_variances->horz[1].variance < threshold_low) {
        BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
        set_block_size(cpi, xd, mi_row, mi_col, subsize);
        set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
        return 1;
      }
-  }
-
-  // This will only allow 8x8 if the 16x16 variance is very large.
-  if (bsize == BLOCK_16X16) {
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < (threshold << 6)) {
-      set_block_size(cpi, xd, mi_row, mi_col, bsize);
-      return 1;
-    }
+    return 0;
    }
    return 0;
  }
  
-// This function chooses partitioning based on the variance
-// between source and reconstructed last, where variance is
-// computed for 8x8 downsampled inputs. Some things to check:
-// using the last source rather than reconstructed last, and
-// allowing for small downsampling (4x4 or 2x2) for selection
-// of smaller block sizes (i.e., < 16x16).
+// This function chooses partitioning based on the variance between source and
+// reconstructed last, where variance is computed for downsampled inputs.
+// Currently 8x8 downsampling is used for delta frames, 4x4 for key frames.
  static void choose_partitioning(VP9_COMP *cpi,
                                  const TileInfo *const tile,
                                  MACROBLOCK *x,
@@ -463,7 +487,7 @@ static void choose_partitioning(VP9_COMP *cpi,
    VP9_COMMON * const cm = &cpi->common;
    MACROBLOCKD *xd = &x->e_mbd;
  
-  int i, j, k;
+  int i, j, k, m;
    v64x64 vt;
    uint8_t *s;
    const uint8_t *d;
@@ -525,38 +549,63 @@ static void choose_partitioning(VP9_COMP *cpi,
        const int y16_idx = y32_idx + ((j >> 1) << 4);
        v16x16 *vst = &vt.split[i].split[j];
        for (k = 0; k < 4; k++) {
-        int x_idx = x16_idx + ((k & 1) << 3);
-        int y_idx = y16_idx + ((k >> 1) << 3);
-        unsigned int sse = 0;
-        int sum = 0;
-
-        if (x_idx < pixels_wide && y_idx < pixels_high) {
-          int s_avg, d_avg;
+        int x8_idx = x16_idx + ((k & 1) << 3);
+        int y8_idx = y16_idx + ((k >> 1) << 3);
+        if (cm->frame_type != KEY_FRAME) {
+          unsigned int sse = 0;
+          int sum = 0;
+          if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+            int s_avg, d_avg;
  #if CONFIG_VP9_HIGHBITDEPTH
-          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-            s_avg = vp9_highbd_avg_8x8(s + y_idx * sp + x_idx, sp);
-            d_avg = vp9_highbd_avg_8x8(d + y_idx * dp + x_idx, dp);
-          } else {
-            s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
-            d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
-          }
+            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+              s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+              d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+            } else {
+              s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+              d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+           }
  #else
-          s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
-          d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+            s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+            d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
  #endif
-          sum = s_avg - d_avg;
-          sse = sum * sum;
+            sum = s_avg - d_avg;
+            sse = sum * sum;
+          }
+          // If variance is based on 8x8 downsampling, we stop here and have
+          // one sample for 8x8 block (so use 1 for count in fill_variance),
+          // which of course means variance = 0 for 8x8 block.
+          fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
+        } else {
+          // For key frame, go down to 4x4.
+          v8x8 *vst2 = &vst->split[k];
+          for (m = 0; m < 4; m++) {
+            int x4_idx = x8_idx + ((m & 1) << 2);
+            int y4_idx = y8_idx + ((m >> 1) << 2);
+            unsigned int sse = 0;
+            int sum = 0;
+            if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+              int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+              // For key frame, reference is set to 128.
+              sum = s_avg - 128;
+              sse = sum * sum;
+            }
+            // If variance is based on 4x4 downsampling, we stop here and have
+            // one sample for 4x4 block (so use 1 for count in fill_variance),
+            // which of course means variance = 0 for 4x4 block.
+           fill_variance(sse, sum, 1, &vst2->split[m].part_variances.none);
+          }
          }
-        // For an 8x8 block we have just one value the average of all 64
-        // pixels,  so use 1.   This means of course that there is no variance
-        // in an 8x8 block.
-        fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
        }
      }
    }
    // Fill the rest of the variance tree by summing split partition values.
    for (i = 0; i < 4; i++) {
      for (j = 0; j < 4; j++) {
+      if (cm->frame_type == KEY_FRAME) {
+        for (m = 0; m < 4; m++) {
+          fill_variance_tree(&vt.split[i].split[j].split[m], BLOCK_8X8);
+        }
+      }
        fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
      }
      fill_variance_tree(&vt.split[i], BLOCK_32X32);
@@ -564,8 +613,7 @@ static void choose_partitioning(VP9_COMP *cpi,
    fill_variance_tree(&vt, BLOCK_64X64);
  
    // Now go through the entire structure,  splitting every block size until
-  // we get to one that's got a variance lower than our threshold,  or we
-  // hit 8x8.
+  // we get to one that's got a variance lower than our threshold.
    if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
        !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col)) {
      for (i = 0; i < 4; ++i) {
@@ -576,11 +624,13 @@ static void choose_partitioning(VP9_COMP *cpi,
          for (j = 0; j < 4; ++j) {
            const int x16_idx = ((j & 1) << 1);
            const int y16_idx = ((j >> 1) << 1);
-          // NOTE: Since this uses 8x8 downsampling for variance calculation
-          // we cannot really select block size 8x8 (or even 8x16/16x8),
-          // since we do not sufficient samples for variance.
-          // For now, 8x8 partition is only set if the variance of the 16x16
-          // block is very high. This is controlled in set_vt_partitioning.
+          // Note: If 8x8 downsampling is used for variance calculation we
+          // cannot really select block size 8x8 (or even 8x16/16x8), since we
+          // don't have sufficient samples for variance. So on delta frames,
+          // 8x8 partition is only set if variance of the 16x16 block is very
+          // high. For key frames, 4x4 downsampling is used, so we can better
+          // select 8x16/16x8 and 8x8. 4x4 partition can potentially be set
+          // used here too, but for now 4x4 is not allowed.
            if (!set_vt_partitioning(cpi, xd, &vt.split[i].split[j],
                                     BLOCK_16X16,
                                     mi_row + y32_idx + y16_idx,
@@ -588,10 +638,26 @@ static void choose_partitioning(VP9_COMP *cpi,
              for (k = 0; k < 4; ++k) {
                const int x8_idx = (k & 1);
                const int y8_idx = (k >> 1);
-              set_block_size(cpi, xd,
-                             (mi_row + y32_idx + y16_idx + y8_idx),
-                             (mi_col + x32_idx + x16_idx + x8_idx),
-                             BLOCK_8X8);
+              // TODO(marpan): Allow for setting 4x4 partition on key frame.
+              /*
+              if (cm->frame_type == KEY_FRAME) {
+                if (!set_vt_partitioning(cpi, xd,
+                                         &vt.split[i].split[j].split[k],
+                                         BLOCK_8X8,
+                                         mi_row + y32_idx + y16_idx + y8_idx,
+                                         mi_col + x32_idx + x16_idx + x8_idx)) {
+                    set_block_size(cpi, xd,
+                                  (mi_row + y32_idx + y16_idx + y8_idx),
+                                  (mi_col + x32_idx + x16_idx + x8_idx),
+                                   BLOCK_4X4);
+                }
+              } else {
+              */
+                set_block_size(cpi, xd,
+                               (mi_row + y32_idx + y16_idx + y8_idx),
+                               (mi_col + x32_idx + x16_idx + x8_idx),
+                               BLOCK_8X8);
+              // }
              }
            }
          }
@@ -2511,7 +2577,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi,
        rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                         BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
      } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
-               cm->frame_type != KEY_FRAME ) {
+               cm->frame_type != KEY_FRAME) {
        choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
        rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                         BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
@@ -3532,6 +3598,11 @@ static void encode_frame_internal(VP9_COMP *cpi) {
                   cm->uv_ac_delta_q == 0;
  
    cm->tx_mode = select_tx_mode(cpi, xd);
+  if (cm->frame_type == KEY_FRAME &&
+      cpi->sf.use_nonrd_pick_mode &&
+      cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+    cm->tx_mode = ALLOW_16X16;
+  }
  
  #if CONFIG_VP9_HIGHBITDEPTH
    if (cm->use_highbitdepth)
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c

index 5c70b4ee464792b15cd76cd3b037fdb8e991c134..85d0dba599c147176a56c5cdce42c6f70701350c 100644 (file)
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -321,7 +321,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
      sf->partition_search_type = VAR_BASED_PARTITION;
  
      // Turn on this to use non-RD key frame coding mode.
-    // sf->use_nonrd_pick_mode = 1;
+    sf->use_nonrd_pick_mode = 1;
      sf->mv.search_method = NSTEP;
      sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
      sf->mv.reduce_first_step_size = 1;
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c

index ca6cf1ac91667168d2f2265e3cea3a64bc9846ef..4c3495b056033380e5c99f27914b6f4e36362abb 100644 (file)
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -38,3 +38,21 @@ unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
    avg = _mm_extract_epi16(s0, 0);
    return (avg + 32) >> 6;
  }
+
+unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0  = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 8) >> 4;
+}
author	Marco <marpan@google.com>
	Wed, 12 Nov 2014 22:51:49 +0000 (14:51 -0800)
committer	Marco <marpan@google.com>
	Wed, 3 Dec 2014 17:18:08 +0000 (09:18 -0800)
test/vp9_avg_test.cc		patch \| blob \| history
vp9/common/vp9_rtcd_defs.pl		patch \| blob \| history
vp9/encoder/vp9_avg.c		patch \| blob \| history
vp9/encoder/vp9_encodeframe.c		patch \| blob \| history
vp9/encoder/vp9_speed_features.c		patch \| blob \| history
vp9/encoder/x86/vp9_avg_intrin_sse2.c		patch \| blob \| history