granicus.if.org Git - libvpx/commitdiff
Add joint motion search in comp_inter_inter mode (experiment)
author     Yunqing Wang <yunqingwang@google.com>
           Tue, 7 May 2013 16:45:28 +0000 (09:45 -0700)
committer  Yunqing Wang <yunqingwang@google.com>
           Fri, 10 May 2013 17:15:43 +0000 (10:15 -0700)
In the current code, motion vectors obtained in single prediction mode
are used directly in compound prediction mode. These motion vectors may
not give an accurate prediction since they are searched independently.
In this patch, we took Pascal's suggestion and did a joint motion search
in compound prediction mode to find better motion vectors in this
situation.
Test results:
Overall PSNR: 0.570% (derf), 0.918% (stdhd);
SSIM: 0.572% (derf), 1.009% (stdhd);

The encoder is a little slower. This can be improved, since some C
code is still used in the motion search.

Change-Id: Ib30c9240f6c56c9b070867b4ca89412a76d9f3c6
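
At a glance, the joint search holds one reference's predictor fixed and
refines the other reference's motion vector so that the *averaged*
compound predictor best matches the source block; the roles then swap on
the next pass. The following self-contained toy (illustration only, not
libvpx code; the real entry points added below are
vp9_refining_search_8p_c() and vp9_find_best_sub_pixel_comp()) sketches
one such full-pel refinement step over an 8x8 block:

#include <stdint.h>
#include <stdlib.h>

#define BW 8
#define BH 8

/* SAD of the source against the compound predictor
 * ((ref + second_pred + 1) >> 1), the same averaging used by
 * comp_avg_pred() in this patch. */
static unsigned int comp_sad_toy(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 const uint8_t *second_pred) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < BH; r++)
    for (c = 0; c < BW; c++) {
      const int avg =
          (ref[r * ref_stride + c] + second_pred[r * BW + c] + 1) >> 1;
      sad += abs(avg - src[r * src_stride + c]);
    }
  return sad;
}

/* One full-pel refining step: try the 8 neighbors of (*row, *col) and
 * move to the best one, keeping second_pred fixed. The caller must keep
 * the whole 8-neighborhood inside the reference frame. */
static void refine_step_toy(const uint8_t *src, int src_stride,
                            const uint8_t *ref_frame, int ref_stride,
                            const uint8_t *second_pred,
                            int *row, int *col) {
  static const int dr[8] = {-1, 0, 0, 1, -1, 1, -1, 1};
  static const int dc[8] = { 0, -1, 1, 0, -1, -1, 1, 1};
  unsigned int best = comp_sad_toy(src, src_stride,
                                   ref_frame + *row * ref_stride + *col,
                                   ref_stride, second_pred);
  int j, best_j = -1;
  for (j = 0; j < 8; j++) {
    const unsigned int sad =
        comp_sad_toy(src, src_stride,
                     ref_frame + (*row + dr[j]) * ref_stride + (*col + dc[j]),
                     ref_stride, second_pred);
    if (sad < best) {
      best = sad;
      best_j = j;
    }
  }
  if (best_j >= 0) {
    *row += dr[best_j];
    *col += dc[best_j];
  }
}

In the patch itself this step is iterated (up to search_range times at
full-pel, then refined to sub-pel precision), once per reference frame,
with a motion-vector rate cost added to the distortion.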

configure
vp9/common/vp9_enums.h
vp9/common/vp9_rtcd_defs.sh
vp9/encoder/vp9_mcomp.c
vp9/encoder/vp9_mcomp.h
vp9/encoder/vp9_onyx_if.c
vp9/encoder/vp9_rdopt.c
vp9/encoder/vp9_variance.h
vp9/encoder/vp9_variance_c.c

index 5cbf070952259460b1c2ce3391e11642c6318d84..cc8c58141bedced7498fc174c6b308197c0f3f92 100755 (executable)
--- a/configure
+++ b/configure
@@ -247,6 +247,7 @@ EXPERIMENT_LIST="
     multiple_arf
     non420
     ab4x4
+    comp_inter_joint_search
 "
 CONFIG_LIST="
     external_build
index 1663195651f9e95ef713e52c7c3dca38fb1cc6ac..2f6707487fa2c19cc41297282847d81442f36dc5 100644 (file)
@@ -36,6 +36,7 @@ typedef enum BLOCK_SIZE_TYPE {
   BLOCK_SIZE_SB32X64,
   BLOCK_SIZE_SB64X32,
   BLOCK_SIZE_SB64X64,
+  BLOCK_SIZE_TYPES
 } BLOCK_SIZE_TYPE;
 
 typedef enum PARTITION_TYPE {
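
The new trailing BLOCK_SIZE_TYPES enumerator is the usual enum-count
idiom: the entries before it run from 0 upward, so it evaluates to the
number of block sizes and can bound per-size tables, as the b_sz
dimension table added in vp9_rdopt.c below does. For instance
(hypothetical table, for illustration only):

/* BLOCK_SIZE_TYPES as an array bound: one slot per block size. */
static int joint_search_hits[BLOCK_SIZE_TYPES];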
index 75e36040c9a38881ab514652fa76a3a4c5702c05..abae952536506e4c48073862453083ee0129a1f5 100644 (file)
@@ -337,41 +337,74 @@ vp9_variance4x4_mmx=vp9_variance4x4_mmx
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x64 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x64
+
 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x64
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x64
+
 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x32
 
+prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x32
+
 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x16
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x16
+
 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x32
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x32
+
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x32
+
 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x16
+
 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x16 sse2 mmx
 vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x16
+
 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x8
+
 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x8 sse2 mmx
 vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x8
+
 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x4
+
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad64x64 sse2
 
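Since the new vp9_sub_pixel_avg_variance* prototypes specialize no SIMD
variants, the RTCD generator binds each of them straight to its C
implementation; the generated vp9_rtcd.h entries should look roughly
like this (a sketch of generated output, not hand-written code):

unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
                                               int source_stride,
                                               int xoffset, int yoffset,
                                               const uint8_t *ref_ptr,
                                               int ref_stride,
                                               unsigned int *sse,
                                               const uint8_t *second_pred);
#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c

This is the "some C code is still used in the motion search" slowdown
mentioned in the commit message; adding SIMD versions later only
requires extending the specialize lines.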
index 74caba5a0b3b9a83f704cefa7eba703c6e42c972..aff5637e1d61f936d24390f557a86d6424336fe2 100644 (file)
@@ -413,6 +413,201 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
 
   return besterr;
 }
+
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+#undef DIST
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+    vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+              z, src_stride, &sse, second_pred)
+
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion,
+                                 unsigned int *sse1,
+                                 const uint8_t *second_pred, int w, int h) {
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int rr, rc, br, bc, hstep;
+  int tr, tc;
+  unsigned int besterr = INT_MAX;
+  unsigned int left, right, up, down, diag;
+  unsigned int sse;
+  unsigned int whichdir;
+  unsigned int halfiters = 4;
+  unsigned int quarteriters = 4;
+  unsigned int eighthiters = 4;
+  int thismse;
+  int maxc, minc, maxr, minr;
+  int y_stride;
+  int offset;
+  int usehp = xd->allow_high_precision_mv;
+
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
+
+  y_stride = xd->plane[0].pre[0].stride;
+
+  rr = ref_mv->as_mv.row;
+  rc = ref_mv->as_mv.col;
+  br = bestmv->as_mv.row << 3;
+  bc = bestmv->as_mv.col << 3;
+  hstep = 4;
+  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
+             ((1 << MV_MAX_BITS) - 1));
+  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
+             ((1 << MV_MAX_BITS) - 1));
+
+  tr = br;
+  tc = bc;
+
+
+  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+
+  // calculate central point error
+  // TODO(yunqingwang): the central point error was already calculated in
+  // full-pixel search, and can be passed into this function.
+  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+  besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
+                         error_per_bit, xd->allow_high_precision_mv);
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (two if the diagonal was selected).
+  while (--halfiters) {
+    // 1/2 pel
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (two if the diagonal was selected). 1/4-pel step.
+  hstep >>= 1;
+  while (--quarteriters) {
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  if (xd->allow_high_precision_mv) {
+    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+  } else {
+    usehp = 0;
+  }
+
+  if (usehp) {
+    hstep >>= 1;
+    while (--eighthiters) {
+      CHECK_BETTER(left, tr, tc - hstep);
+      CHECK_BETTER(right, tr, tc + hstep);
+      CHECK_BETTER(up, tr - hstep, tc);
+      CHECK_BETTER(down, tr + hstep, tc);
+
+      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+      switch (whichdir) {
+        case 0:
+          CHECK_BETTER(diag, tr - hstep, tc - hstep);
+          break;
+        case 1:
+          CHECK_BETTER(diag, tr - hstep, tc + hstep);
+          break;
+        case 2:
+          CHECK_BETTER(diag, tr + hstep, tc - hstep);
+          break;
+        case 3:
+          CHECK_BETTER(diag, tr + hstep, tc + hstep);
+          break;
+      }
+
+      // no reason to check the same one again.
+      if (tr == br && tc == bc)
+        break;
+
+      tr = br;
+      tc = bc;
+    }
+  }
+  bestmv->as_mv.row = br;
+  bestmv->as_mv.col = bc;
+
+  vpx_free(comp_pred);
+
+  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
+
 #undef MVC
 #undef PRE
 #undef DIST
@@ -2132,7 +2327,109 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
     return INT_MAX;
 }
 
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+/* This function is called when we do joint motion search in comp_inter_inter
+ * mode.
+ */
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2], int_mv *center_mv,
+                             const uint8_t *second_pred, int w, int h) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+      {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  int i, j;
+  int this_row_offset, this_col_offset;
 
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  uint8_t *what = x->plane[0].src.buf;
+  uint8_t *best_address = xd->plane[0].pre[0].buf +
+                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+                          ref_mv->as_mv.col;
+  uint8_t *check_here;
+  unsigned int thissad;
+  int_mv this_mv;
+  unsigned int bestsad = INT_MAX;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  /* Compound pred buffer */
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  /* Get compound pred by averaging two pred blocks. */
+  comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+
+  bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 8; j++) {
+      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+      if ((this_col_offset > x->mv_col_min) &&
+          (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) &&
+          (this_row_offset < x->mv_row_max)) {
+        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+            best_address;
+
+        /* Get compound block and use it to calculate SAD. */
+        comp_avg_pred(comp_pred, second_pred, w, h, check_here,
+                      in_what_stride);
+        thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.row = this_row_offset;
+          this_mv.as_mv.col = this_col_offset;
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
+                                    mvsadcost, error_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_site = j;
+          }
+        }
+      }
+    }
+
+    if (best_site == -1) {
+      break;
+    } else {
+      ref_mv->as_mv.row += neighbors[best_site].row;
+      ref_mv->as_mv.col += neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride +
+          neighbors[best_site].col;
+    }
+  }
+
+  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX) {
+    int besterr;
+    comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+    besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
+        (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
+                    xd->allow_high_precision_mv);
+    vpx_free(comp_pred);
+    return besterr;
+  } else {
+    vpx_free(comp_pred);
+    return INT_MAX;
+  }
+}
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
 
 #ifdef ENTROPY_STATS
 void print_mode_context(VP9_COMMON *pc) {
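
Note the two cost stages in vp9_refining_search_8p_c(): the search loop
ranks candidates by SAD of the source against the compound average plus
an approximate mv rate cost, while the returned value is recomputed as a
variance plus the exact mv rate cost, matching what the sub-pel search
minimizes. Schematically (an illustrative fragment reusing the
function's own locals, not standalone code):

/* Search loop ranking vs. final returned error (same comp_pred): */
unsigned int sad_cost =
    fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad) +
    mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost,
                   error_per_bit);
unsigned int var_cost =
    fn_ptr->vf(what, what_stride, comp_pred, w, &sse) +
    mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
                xd->allow_high_precision_mv);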
index e1ba7fd9dceb795d46886b16d536d01339b5fb86..cdbd29aa5d1414fd838c41a90e47c5087baedae7 100644 (file)
@@ -79,5 +79,21 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
                                        int *mvjcost, int *mvcost[2],
                                        int_mv *center_mv);
 
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion, unsigned int *sse1,
+                                 const uint8_t *second_pred,
+                                 int w, int h);
 
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2],
+                             int_mv *center_mv, const uint8_t *second_pred,
+                             int w, int h);
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
 #endif  // VP9_ENCODER_VP9_MCOMP_H_
index 782816f9288d2f972b6c695826810c8c4bf4fb77..05105d7945d759bf77ddd387aa83dfa6a1adef1c 100644 (file)
@@ -1516,10 +1516,11 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   for (i = 0; i < MAX_MODES; i++)
     cpi->rd_thresh_mult[i] = 128;
 
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
     cpi->fn_ptr[BT].vf             = VF; \
     cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svaf           = SVAF; \
     cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
     cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
     cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
@@ -1528,57 +1529,64 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
   BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance32x16, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad32x16x4d)
 
   BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance16x32, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad16x32x4d)
 
   BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance64x32, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad64x32x4d)
 
   BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance32x64, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad32x64x4d)
 
   BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+      vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
+      vp9_variance_halfpixvar32x32_v,
       vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
 
   BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
+      vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
+      vp9_variance_halfpixvar64x64_v,
       vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
       vp9_sad64x64x4d)
 
   BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
-       vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
-       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
-       vp9_sad16x16x4d)
+      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
+      vp9_variance_halfpixvar16x16_v,
+      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+      vp9_sad16x16x4d)
 
   BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
+      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
 
   BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
+      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
 
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
+      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
   BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL)
+      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 
   BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL)
+      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
+      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
   cpi->full_search_sad = vp9_full_search_sad;
   cpi->diamond_search_sad = vp9_diamond_search_sad;
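
The new svaf slot is what the compound sub-pel search consumes: the DIST
macro in vp9_mcomp.c evaluates every candidate position through it. A
hypothetical direct call through the table (wrapper for illustration
only; buffer setup is the caller's responsibility) would look like:

/* Illustrative wrapper: compound sub-pel error for one candidate, via
 * the new svaf entry, i.e. the variance of ((pred + second_pred + 1) >> 1)
 * against the source block. */
static unsigned int comp_subpel_err(VP9_COMP *cpi,
                                    const uint8_t *pre_buf, int pre_stride,
                                    int xoffset, int yoffset,
                                    const uint8_t *src_buf, int src_stride,
                                    const uint8_t *second_pred) {
  unsigned int sse;
  return cpi->fn_ptr[BLOCK_16X16].svaf(pre_buf, pre_stride,
                                       xoffset, yoffset,
                                       src_buf, src_stride, &sse,
                                       second_pred);
}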
index 1b143f5e089eaa83c0700ab0e6cbdc8b7df9124d..24b81c025ae9b3025d70af7e95d0a61e7411dc8a 100644 (file)
@@ -1807,8 +1807,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  INTERPOLATIONFILTERTYPE *best_filter,
                                  int_mv frame_mv[MB_MODE_COUNT]
                                                 [MAX_REF_FRAMES],
-                                 YV12_BUFFER_CONFIG *scaled_ref_frame,
-                                 int mi_row, int mi_col) {
+                                 YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                 int mi_row, int mi_col,
+                                 int_mv single_newmv[MAX_REF_FRAMES]) {
   const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
 
   VP9_COMMON *cm = &cpi->common;
@@ -1838,6 +1839,152 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
 
       if (is_comp_pred) {
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+        const int b_sz[BLOCK_SIZE_TYPES][2] = {
+            {4, 4},
+            {8, 8},
+            {8, 16},
+            {16, 8},
+            {16, 16},
+            {16, 32},
+            {32, 16},
+            {32, 32},
+            {32, 64},
+            {64, 32},
+            {64, 64}
+        };
+
+        int ite;
+        // Prediction buffer from second frame.
+        uint8_t *second_pred = vpx_memalign(16, b_sz[bsize][0] *
+                                            b_sz[bsize][1] * sizeof(uint8_t));
+
+        // Do joint motion search in compound mode to get more accurate mv.
+        struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+        struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+        struct buf_2d scaled_first_yv12;
+
+        if (scaled_ref_frame[0]) {
+          int i;
+
+          // Swap out the reference frame for a version that's been scaled to
+          // match the resolution of the current frame, allowing the existing
+          // motion search code to be used without additional modifications.
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_yv12[i] = xd->plane[i].pre[0];
+
+          setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                           NULL, NULL);
+        }
+
+        if (scaled_ref_frame[1]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_second_yv12[i] = xd->plane[i].pre[1];
+
+          setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+                           NULL, NULL);
+        }
+        xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+                                                mi_row, mi_col);
+        xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+                                                mi_row, mi_col);
+
+        scaled_first_yv12 = xd->plane[0].pre[0];
+
+        // Initialize mv using single prediction mode result.
+        frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int;
+        frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+        // Iteration: joint search is done once for each ref frame.
+        // Running the search multiple times iteratively, breaking out
+        // once no better mv was found, was also tried, but tests showed
+        // no noticeable improvement.
+        for (ite = 0; ite < 2; ite++) {
+          struct buf_2d ref_yv12[2] = {xd->plane[0].pre[0],
+                                       xd->plane[0].pre[1]};
+          int bestsme = INT_MAX;
+          int sadpb = x->sadperbit16;
+          int_mv tmp_mv;
+          int search_range = 3;
+
+          int tmp_col_min = x->mv_col_min;
+          int tmp_col_max = x->mv_col_max;
+          int tmp_row_min = x->mv_row_min;
+          int tmp_row_max = x->mv_row_max;
+          int id = ite % 2;
+
+          // Get pred block from second frame.
+          vp9_build_inter_predictor(ref_yv12[!id].buf,
+                                    ref_yv12[!id].stride,
+                                    second_pred, b_sz[bsize][0],
+                                    &frame_mv[NEWMV][refs[!id]],
+                                    &xd->scale_factor[!id],
+                                    b_sz[bsize][0], b_sz[bsize][1], 0,
+                                    &xd->subpix);
+
+          // Compound motion search on first ref frame.
+          if (id)
+            xd->plane[0].pre[0] = ref_yv12[id];
+          vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+          // Use mv result from single mode as mvp.
+          tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int;
+
+          tmp_mv.as_mv.col >>= 3;
+          tmp_mv.as_mv.row >>= 3;
+
+          // Small-range full-pixel motion search
+          bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                             search_range,
+                                             &cpi->fn_ptr[block_size],
+                                             x->nmvjointcost, x->mvcost,
+                                             &ref_mv[id], second_pred,
+                                             b_sz[bsize][0], b_sz[bsize][1]);
+
+          x->mv_col_min = tmp_col_min;
+          x->mv_col_max = tmp_col_max;
+          x->mv_row_min = tmp_row_min;
+          x->mv_row_max = tmp_row_max;
+
+          if (bestsme < INT_MAX) {
+            int dis; /* TODO: use dis in distortion calculation later. */
+            unsigned int sse;
+
+            vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+                                         &ref_mv[id],
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[block_size],
+                                         x->nmvjointcost, x->mvcost,
+                                         &dis, &sse, second_pred,
+                                         b_sz[bsize][0], b_sz[bsize][1]);
+          }
+
+          frame_mv[NEWMV][refs[id]].as_int =
+              xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
+          if (id)
+            xd->plane[0].pre[0] = scaled_first_yv12;
+        }
+
+        // restore the predictor
+        if (scaled_ref_frame[0]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[0] = backup_yv12[i];
+        }
+
+        if (scaled_ref_frame[1]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[1] = backup_second_yv12[i];
+        }
+
+        vpx_free(second_pred);
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
+
         if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
             frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
           return INT64_MAX;
@@ -1862,7 +2009,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int tmp_row_min = x->mv_row_min;
         int tmp_row_max = x->mv_row_max;
 
-        if (scaled_ref_frame) {
+        if (scaled_ref_frame[0]) {
           int i;
 
           // Swap out the reference frame for a version that's been scaled to
@@ -1871,7 +2018,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           for (i = 0; i < MAX_MB_PLANE; i++)
             backup_yv12[i] = xd->plane[i].pre[0];
 
-          setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col,
+          setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
                            NULL, NULL);
         }
 
@@ -1914,6 +2061,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         }
         frame_mv[NEWMV][refs[0]].as_int =
           xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+        single_newmv[refs[0]].as_int = tmp_mv.as_int;
 
         // Add the new motion vector cost to our rolling cost variable
         *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
@@ -1921,7 +2069,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                   96, xd->allow_high_precision_mv);
 
         // restore the predictor, if required
-        if (scaled_ref_frame) {
+        if (scaled_ref_frame[0]) {
           int i;
 
           for (i = 0; i < MAX_MB_PLANE; i++)
@@ -2205,6 +2353,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   int frame_mdcounts[4][4];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  int_mv single_newmv[MAX_REF_FRAMES];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int idx_list[4] = {0,
@@ -2251,6 +2400,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   xd->mode_info_context->mbmi.segment_id = segment_id;
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+  vpx_memset(&single_newmv, 0, sizeof(single_newmv));
 
   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
     best_pred_rd[i] = INT64_MAX;
@@ -2608,7 +2758,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
       mbmi->mode = this_mode;
     } else {
-      YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
+      YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
       int fb;
 
       if (mbmi->ref_frame == LAST_FRAME) {
@@ -2620,7 +2770,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       }
 
       if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+        scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+
+      if (comp_pred) {
+        if (mbmi->second_ref_frame == LAST_FRAME) {
+          fb = cpi->lst_fb_idx;
+        } else if (mbmi->second_ref_frame == GOLDEN_FRAME) {
+          fb = cpi->gld_fb_idx;
+        } else {
+          fb = cpi->alt_fb_idx;
+        }
+
+        if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+          scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+      }
 
       this_rd = handle_inter_mode(cpi, x, bsize,
                                   mdcounts, txfm_cache,
@@ -2630,7 +2793,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   &rate_uv, &distortion_uv,
                                   &mode_excluded, &disable_skip,
                                   mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mi_row, mi_col);
+                                  scaled_ref_frame, mi_row, mi_col,
+                                  single_newmv);
       if (this_rd == INT64_MAX)
         continue;
     }
index 13dabbda41d711a8dce115cf2892182450083a14..306476b018c83baee0396afa2efa048134d8b6b0 100644 (file)
@@ -12,6 +12,7 @@
 #define VP9_ENCODER_VP9_VARIANCE_H_
 
 #include "vpx/vpx_integer.h"
+// #include "./vpx_config.h"
 
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
@@ -50,6 +51,15 @@ typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
                                                 int Refstride,
                                                 unsigned int *sse);
 
+typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
+                                                   int source_stride,
+                                                   int xoffset,
+                                                   int yoffset,
+                                                   const uint8_t *ref_ptr,
+                                                   int Refstride,
+                                                   unsigned int *sse,
+                                                   const uint8_t *second_pred);
+
 typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
                                 int rp, unsigned long *sum_s,
                                 unsigned long *sum_r, unsigned long *sum_sq_s,
@@ -64,15 +74,33 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
                                                    int  ref_stride);
 
 typedef struct vp9_variance_vtable {
-    vp9_sad_fn_t            sdf;
-    vp9_variance_fn_t       vf;
-    vp9_subpixvariance_fn_t svf;
-    vp9_variance_fn_t       svf_halfpix_h;
-    vp9_variance_fn_t       svf_halfpix_v;
-    vp9_variance_fn_t       svf_halfpix_hv;
-    vp9_sad_multi_fn_t      sdx3f;
-    vp9_sad_multi1_fn_t     sdx8f;
-    vp9_sad_multi_d_fn_t    sdx4df;
+    vp9_sad_fn_t               sdf;
+    vp9_variance_fn_t          vf;
+    vp9_subpixvariance_fn_t    svf;
+    vp9_subp_avg_variance_fn_t svaf;
+    vp9_variance_fn_t          svf_halfpix_h;
+    vp9_variance_fn_t          svf_halfpix_v;
+    vp9_variance_fn_t          svf_halfpix_hv;
+    vp9_sad_multi_fn_t         sdx3f;
+    vp9_sad_multi1_fn_t        sdx8f;
+    vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
+// #if CONFIG_COMP_INTER_JOINT_SEARCH
+static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+                          int height, uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      // average the two predictors, rounding half up
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = (tmp + 1) >> 1;
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+// #endif  // CONFIG_COMP_INTER_JOINT_SEARCH
 #endif  // VP9_ENCODER_VP9_VARIANCE_H_
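
For reference, the (tmp + 1) >> 1 averaging in comp_avg_pred() rounds
ties upward and cannot overflow with 8-bit inputs:

/* Worked values for comp_pred[j] = (pred[j] + ref[j] + 1) >> 1:
 *   pred = 5,   ref = 6    ->  (5 + 6 + 1) >> 1     = 6    (ties round up)
 *   pred = 5,   ref = 5    ->  (5 + 5 + 1) >> 1     = 5
 *   pred = 255, ref = 255  ->  (255 + 255 + 1) >> 1 = 255  (int math, max 511) */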
index c2a600408d50d72bd5961e50f37c3338f0fd08ed..fa53abdecea42c4a243aaf450357390bbf0274dc 100644 (file)
@@ -13,6 +13,7 @@
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_subpelvar.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
@@ -58,6 +59,29 @@ unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr,
   return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
+  return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -92,6 +116,29 @@ unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr,
   return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
+  return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -126,6 +173,29 @@ unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr,
   return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
+  return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -160,6 +230,29 @@ unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr,
   return vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
+  return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -317,6 +410,31 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
   return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4);  // compound pred buffer
+  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  // First filter horizontally (1-D)
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 4, hfilter);
+
+  // Now filter vertically
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
+  comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
+  return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
 
 unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
                                          int  src_pixels_per_line,
@@ -339,6 +457,29 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
   return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
+  return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -360,6 +501,30 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
   return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[17 * 16];
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
+
+  comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
+  return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -381,6 +546,29 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
   return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
+  return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -402,6 +590,29 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
   return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
+  return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
                                               int  source_stride,
                                               const uint8_t *ref_ptr,
@@ -543,6 +754,29 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
   return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
+  return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
                                           int  src_pixels_per_line,
                                           int  xoffset,
@@ -564,3 +798,25 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
   return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
+  return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}