Do sub-pixel motion search in up-sampled reference frames

author Yunqing Wang <yunqingwang@google.com>

Tue, 16 Feb 2016 22:33:18 +0000 (14:33 -0800)

committer Yunqing Wang <yunqingwang@google.com>

Mon, 29 Feb 2016 20:14:47 +0000 (12:14 -0800)
author Yunqing Wang <yunqingwang@google.com>
Tue, 16 Feb 2016 22:33:18 +0000 (14:33 -0800)
committer Yunqing Wang <yunqingwang@google.com>
Mon, 29 Feb 2016 20:14:47 +0000 (12:14 -0800)
diff --git a/configure b/configure

index 9769880438846f7094f3ad323636180eeab126c6..5eec2a117423c2ad714a9baf8652d66d01c3b99e 100755 (executable)
--- a/configure
+++ b/configure
@@ -283,6 +283,7 @@ EXPERIMENT_LIST="
      loop_restoration
      ext_partition
      obmc
+    affine_motion
  "
  CONFIG_LIST="
      dependency_tracking
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c

index f6ae6c032dfa5e1acf783c0a18c2476580723830..fc9e2e9242a0eabf8c0760e7dfd69c2345998cdc 100644 (file)
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -410,6 +410,15 @@ static void dealloc_compressor_data(VP10_COMP *cpi) {
    vpx_free(cpi->active_map.map);
    cpi->active_map.map = NULL;
  
+#if CONFIG_AFFINE_MOTION
+  {
+    // Free up-sampled reference buffers.
+    int i;
+    for (i = 0; i < MAX_REF_FRAMES; i++)
+      vpx_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
+  }
+#endif
+
    vp10_free_ref_frame_buffers(cm->buffer_pool);
  #if CONFIG_VP9_POSTPROC
    vp10_free_postproc_buffers(cm);
@@ -744,6 +753,26 @@ static void alloc_util_frame_buffers(VP10_COMP *cpi) {
                                 NULL, NULL, NULL))
      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                         "Failed to allocate scaled last source buffer");
+
+#if CONFIG_AFFINE_MOTION
+  {
+    // Allocate up-sampled reference buffers.
+    int i;
+
+    for (i = 0; i < MAX_REF_FRAMES; i++)
+      if (vpx_realloc_frame_buffer(&cpi->upsampled_ref_bufs[i].buf,
+                                   (cm->width << 3), (cm->height << 3),
+                                   cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                   cm->use_highbitdepth,
+#endif
+                                   (VP9_ENC_BORDER_IN_PIXELS << 3),
+                                   cm->byte_alignment,
+                                   NULL, NULL, NULL))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+            "Failed to allocate up-sampled reference frame buffer");
+  }
+#endif
  }
  
  
@@ -2353,10 +2382,11 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
  
  #if CONFIG_VP9_HIGHBITDEPTH
  static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                   YV12_BUFFER_CONFIG *dst, int bd) {
+                                   YV12_BUFFER_CONFIG *dst, int planes,
+                                   int bd) {
  #else
  static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                   YV12_BUFFER_CONFIG *dst) {
+                                   YV12_BUFFER_CONFIG *dst, int planes) {
  #endif  // CONFIG_VP9_HIGHBITDEPTH
    const int src_w = src->y_crop_width;
    const int src_h = src->y_crop_height;
@@ -2374,7 +2404,7 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
  
    for (y = 0; y < dst_h; y += 16) {
      for (x = 0; x < dst_w; x += 16) {
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
+      for (i = 0; i < planes; ++i) {
          const int factor = (i == 0 || i == 3 ? 1 : 2);
          const int x_q4 = x * (16 / factor) * src_w / dst_w;
          const int y_q4 = y * (16 / factor) * src_h / dst_h;
@@ -2391,13 +2421,13 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                 &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
                                 16 / factor, 16 / factor, bd);
          } else {
-          vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
                          &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
                          &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
                          16 / factor, 16 / factor);
          }
  #else
-        vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+        vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
                        &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
                        &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
                        16 / factor, 16 / factor);
@@ -2406,7 +2436,10 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
      }
    }
  
-  vpx_extend_frame_borders(dst);
+  if (planes == 1)
+    vpx_extend_frame_borders_y(dst);
+  else
+    vpx_extend_frame_borders(dst);
  }
  
  static int scale_down(VP10_COMP *cpi, int q) {
@@ -2462,6 +2495,45 @@ static int recode_loop_test(VP10_COMP *cpi,
    return force_recode;
  }
  
+#if CONFIG_AFFINE_MOTION
+// Returns the index of the first entry in ubufs whose ref_count is zero, or
+// INVALID_IDX when all MAX_REF_FRAMES entries are still referenced.
+static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) {
+  int slot = 0;
+
+  while (slot < MAX_REF_FRAMES) {
+    if (ubufs[slot].ref_count == 0)
+      return slot;
+    ++slot;
+  }
+  return INVALID_IDX;
+}
+
+// Up-sample the reference frame bufs[new_idx] into an unreferenced entry of
+// ubufs. Returns the index of the entry that was written, or INVALID_IDX
+// (without scaling anything) when get_free_upsampled_ref_buf() finds no
+// entry with a zero ref_count.
+static INLINE int upsample_ref_frame(RefCntBuffer *bufs,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                     EncRefCntBuffer *ubufs, int new_idx,
+                                     int bit_depth) {
+#else
+                                     EncRefCntBuffer *ubufs, int new_idx) {
+#endif
+  const int free_uidx = get_free_upsampled_ref_buf(ubufs);
+
+  if (free_uidx == INVALID_IDX)
+    return INVALID_IDX;
+
+  // Currently, only Y plane is up-sampled, U, V are not used, hence the
+  // plane count of 1 passed to scale_and_extend_frame().
+#if CONFIG_VP9_HIGHBITDEPTH
+  scale_and_extend_frame(&bufs[new_idx].buf, &ubufs[free_uidx].buf, 1,
+                         bit_depth);
+#else
+  scale_and_extend_frame(&bufs[new_idx].buf, &ubufs[free_uidx].buf, 1);
+#endif
+  return free_uidx;
+}
+#endif
+
  void vp10_update_reference_frames(VP10_COMP *cpi) {
    VP10_COMMON * const cm = &cpi->common;
    BufferPool *const pool = cm->buffer_pool;
@@ -2469,6 +2541,17 @@ void vp10_update_reference_frames(VP10_COMP *cpi) {
    int ref_frame;
  #endif  // CONFIG_EXT_REFS
  
+#if CONFIG_AFFINE_MOTION
+  // Always up-sample the current encoded frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+  int new_uidx = upsample_ref_frame(pool->frame_bufs, cpi->upsampled_ref_bufs,
+                                    cm->new_fb_idx, (int)cm->bit_depth);
+#else
+  int new_uidx = upsample_ref_frame(pool->frame_bufs, cpi->upsampled_ref_bufs,
+                                    cm->new_fb_idx);
+#endif
+#endif
+
    // At this point the new frame has been encoded.
    // If any buffer copy / swapping is signaled it should be done here.
    if (cm->frame_type == KEY_FRAME) {
@@ -2476,6 +2559,13 @@ void vp10_update_reference_frames(VP10_COMP *cpi) {
                 &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
      ref_cnt_fb(pool->frame_bufs,
                 &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+
+#if CONFIG_AFFINE_MOTION
+    uref_cnt_fb(cpi->upsampled_ref_bufs,
+                &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+    uref_cnt_fb(cpi->upsampled_ref_bufs,
+                &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+#endif
    } else if (vp10_preserve_existing_gf(cpi)) {
      // We have decided to preserve the previously existing golden frame as our
      // new ARF frame. However, in the short term in function
@@ -2489,7 +2579,10 @@ void vp10_update_reference_frames(VP10_COMP *cpi) {
  
      ref_cnt_fb(pool->frame_bufs,
                 &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
-
+#if CONFIG_AFFINE_MOTION
+    uref_cnt_fb(cpi->upsampled_ref_bufs,
+                &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+#endif
      tmp = cpi->alt_fb_idx;
      cpi->alt_fb_idx = cpi->gld_fb_idx;
      cpi->gld_fb_idx = tmp;
@@ -2503,6 +2596,10 @@ void vp10_update_reference_frames(VP10_COMP *cpi) {
  
        ref_cnt_fb(pool->frame_bufs,
                   &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+#if CONFIG_AFFINE_MOTION
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+#endif
        memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
               cpi->interp_filter_selected[0],
               sizeof(cpi->interp_filter_selected[0]));
@@ -2511,6 +2608,10 @@ void vp10_update_reference_frames(VP10_COMP *cpi) {
      if (cpi->refresh_golden_frame) {
        ref_cnt_fb(pool->frame_bufs,
                   &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+#if CONFIG_AFFINE_MOTION
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+#endif
        if (!cpi->rc.is_src_frame_alt_ref)
          memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
                 cpi->interp_filter_selected[0],
@@ -2545,6 +2646,10 @@ void vp10_update_reference_frames(VP10_COMP *cpi) {
    if (cpi->refresh_last_frame) {
      ref_cnt_fb(pool->frame_bufs,
                 &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
+#if CONFIG_AFFINE_MOTION
+    uref_cnt_fb(cpi->upsampled_ref_bufs,
+                &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
+#endif
      if (!cpi->rc.is_src_frame_alt_ref) {
        memcpy(cpi->interp_filter_selected[LAST_FRAME],
               cpi->interp_filter_selected[0],
@@ -2678,7 +2783,8 @@ void vp10_scale_references(VP10_COMP *cpi) {
                                         cm->byte_alignment, NULL, NULL, NULL))
              vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                 "Failed to allocate frame buffer");
-          scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE,
+                                 (int)cm->bit_depth);
            cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
            alloc_frame_mvs(cm, new_fb);
          }
@@ -2703,11 +2809,39 @@ void vp10_scale_references(VP10_COMP *cpi) {
                                         cm->byte_alignment, NULL, NULL, NULL))
              vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                 "Failed to allocate frame buffer");
-          scale_and_extend_frame(ref, &new_fb_ptr->buf);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE);
            cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
            alloc_frame_mvs(cm, new_fb);
          }
  #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_AFFINE_MOTION
+        {
+          const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+          EncRefCntBuffer *ubuf =
+              &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[map_idx]];
+
+          if (vpx_realloc_frame_buffer(&ubuf->buf,
+                                       (cm->width << 3), (cm->height << 3),
+                                       cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                       cm->use_highbitdepth,
+#endif
+                                       (VP9_ENC_BORDER_IN_PIXELS << 3),
+                                       cm->byte_alignment,
+                                       NULL, NULL, NULL))
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate up-sampled frame buffer");
+#if CONFIG_VP9_HIGHBITDEPTH
+          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, MAX_MB_PLANE,
+                                 (int)cm->bit_depth);
+#else
+          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, MAX_MB_PLANE);
+#endif
+          cpi->scaled_ref_idx[ref_frame - LAST_FRAME] = new_fb;
+          alloc_frame_mvs(cm, new_fb);
+        }
+#endif
        } else {
          const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
          RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
@@ -3787,6 +3921,17 @@ static void init_ref_frame_bufs(VP10_COMMON *cm) {
    }
  }
  
+#if CONFIG_AFFINE_MOTION
+// Resets the up-sampled reference bookkeeping in cpi: every buffer's
+// ref_count becomes 0 and every upsampled_ref_idx entry becomes INVALID_IDX.
+static INLINE void init_upsampled_ref_frame_bufs(VP10_COMP *cpi) {
+  int slot = MAX_REF_FRAMES;
+
+  while (slot-- > 0) {
+    cpi->upsampled_ref_idx[slot] = INVALID_IDX;
+    cpi->upsampled_ref_bufs[slot].ref_count = 0;
+  }
+}
+#endif
+
  static void check_initial_width(VP10_COMP *cpi,
  #if CONFIG_VP9_HIGHBITDEPTH
                                  int use_highbitdepth,
@@ -3809,7 +3954,9 @@ static void check_initial_width(VP10_COMP *cpi,
      alloc_raw_frame_buffers(cpi);
      init_ref_frame_bufs(cm);
      alloc_util_frame_buffers(cpi);
-
+#if CONFIG_AFFINE_MOTION
+    init_upsampled_ref_frame_bufs(cpi);
+#endif
      init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
  
      cpi->initial_width = cm->width;
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h

index 292494caf4bce51853c8f17ac18aadde360f6da8..2c158a47cd23a7ced942f8fd6792187371b4ce83 100644 (file)
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -286,6 +286,13 @@ typedef struct IMAGE_STAT {
    double worst;
  } ImageStat;
  
+#if CONFIG_AFFINE_MOTION
+typedef struct {
+  int ref_count;  // references held on buf; adjusted by uref_cnt_fb()
+  YV12_BUFFER_CONFIG buf;  // frame storage for one up-sampled reference
+} EncRefCntBuffer;
+#endif
+
  typedef struct VP10_COMP {
    QUANTS quants;
    ThreadData td;
@@ -304,6 +311,12 @@ typedef struct VP10_COMP {
    YV12_BUFFER_CONFIG *unscaled_last_source;
    YV12_BUFFER_CONFIG scaled_last_source;
  
+#if CONFIG_AFFINE_MOTION
+  // Up-sampled reference buffers
+  EncRefCntBuffer upsampled_ref_bufs[MAX_REF_FRAMES];
+  int upsampled_ref_idx[MAX_REF_FRAMES];
+#endif
+
    TileDataEnc *tile_data;
    int allocated_tiles;  // Keep track of memory allocated for tiles.
  
@@ -692,4 +705,18 @@ void vp10_new_framerate(VP10_COMP *cpi, double framerate);
  }  // extern "C"
  #endif
  
+#if CONFIG_AFFINE_MOTION
+// Update up-sampled reference frame index.
+//
+// Releases the reference currently held through *uidx (only when *uidx is
+// non-negative and that buffer's ref_count is positive), stores new_uidx in
+// *uidx, and takes a reference on ubufs[new_uidx].
+//
+// new_uidx may be a negative "no buffer" value: upsample_ref_frame() returns
+// INVALID_IDX when no free buffer exists. In that case *uidx is still
+// updated, but no ref_count is incremented, so ubufs is never indexed with a
+// negative subscript.
+static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx,
+                               int new_uidx) {
+  const int ref_index = *uidx;
+
+  if (ref_index >= 0 && ubufs[ref_index].ref_count > 0)
+    ubufs[ref_index].ref_count--;
+
+  *uidx = new_uidx;
+  if (new_uidx >= 0)
+    ubufs[new_uidx].ref_count++;
+}
+#endif
+
  #endif  // VP10_ENCODER_ENCODER_H_
diff --git a/vp10/encoder/mbgraph.c b/vp10/encoder/mbgraph.c

index 2d3a33e392af8f2140aae0b6eaf942f7ee7b93b0..1f467b811bebb6400d8f79734dda6b3254f9ffde 100644 (file)
--- a/vp10/encoder/mbgraph.c
+++ b/vp10/encoder/mbgraph.c
@@ -64,7 +64,11 @@ static unsigned int do_16x16_motion_iteration(VP10_COMP *cpi,
          &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
          cond_cost_list(cpi, cost_list),
          NULL, NULL,
+#if CONFIG_AFFINE_MOTION
+        &distortion, &sse, NULL, 0, 0, 0);
+#else
          &distortion, &sse, NULL, 0, 0);
+#endif
    }
  
  #if CONFIG_EXT_INTER
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c

index 6e3b06ab9cecb20d7b67576ccb04936ad461d517..8949f76bc985beeefe4371bcb8972633b69563dd 100644 (file)
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -208,6 +208,32 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
      v = INT_MAX;                                                       \
    }
  
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)  // k == 0 form of CHECK_BETTER##k
+
+#if CONFIG_AFFINE_MOTION
+// Address of the sample at row r, column c of buf, where consecutive rows are
+// stride bytes apart. r and c are applied as-is, with no shifting or scaling.
+static INLINE const uint8_t *upre(const uint8_t *buf, int stride,
+                                  int r, int c) {
+  return buf + (r * stride + c);
+}
+
+/* checks if (r, c) has better score than previous best
+ *
+ * Variant for searches on the up-sampled reference: the prediction is
+ * addressed with upre(y, y_stride, r, c) and its error is computed by
+ * upsampled_pref_error().
+ *
+ * When (r, c) lies outside [minr, maxr] x [minc, maxc], v is set to INT_MAX
+ * and nothing else changes. Otherwise v is set to MVC(r, c) plus the
+ * prediction error, and if v is below besterr then besterr, br, bc,
+ * *distortion and *sse1 are updated to describe (r, c).
+ *
+ * The macro reads and writes locals of the function it is expanded in
+ * (xd, vfp, z, src_stride, y, y_stride, second_pred, w, h, sse, thismse,
+ * besterr, br, bc, distortion, sse1, minr, maxr, minc, maxc) by name, and it
+ * evaluates r and c more than once. */
+#define CHECK_BETTER1(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = upsampled_pref_error(xd, vfp, z, src_stride,             \
+                                   upre(y, y_stride, r, c), y_stride,  \
+                                   second_pred, w, h, &sse);           \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+#endif
+
  #define FIRST_LEVEL_CHECKS                              \
    {                                                     \
      unsigned int left, right, up, down, diag;           \
@@ -276,7 +302,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
  // TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrite of
  // SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
  // later in the same way.
-#define SECOND_LEVEL_CHECKS_BEST                        \
+#define SECOND_LEVEL_CHECKS_BEST(k)                     \
    {                                                     \
      unsigned int second;                                \
      int br0 = br;                                       \
@@ -287,10 +313,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
      } else if (tr != br && tc == bc) {                  \
        kr = br - tr;                                     \
      }                                                   \
-    CHECK_BETTER(second, br0 + kr, bc0);                \
-    CHECK_BETTER(second, br0, bc0 + kc);                \
+    CHECK_BETTER##k(second, br0 + kr, bc0);             \
+    CHECK_BETTER##k(second, br0, bc0 + kc);             \
      if (br0 != br || bc0 != bc) {                       \
-      CHECK_BETTER(second, br0 + kr, bc0 + kc);         \
+      CHECK_BETTER##k(second, br0 + kr, bc0 + kc);      \
      }                                                   \
    }
  
@@ -412,7 +438,11 @@ int vp10_find_best_sub_pixel_tree_pruned_evenmore(
      int *distortion,
      unsigned int *sse1,
      const uint8_t *second_pred,
+#if CONFIG_AFFINE_MOTION
+    int w, int h, int use_upsampled_ref) {
+#else
      int w, int h) {
+#endif
    SETUP_SUBPEL_SEARCH;
    besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                 z, src_stride, y, y_stride, second_pred,
@@ -425,6 +455,9 @@ int vp10_find_best_sub_pixel_tree_pruned_evenmore(
    (void) allow_hp;
    (void) forced_stop;
    (void) hstep;
+#if CONFIG_AFFINE_MOTION
+  (void) use_upsampled_ref;
+#endif
  
    if (cost_list &&
        cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
@@ -491,8 +524,17 @@ int vp10_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
                                               int *distortion,
                                               unsigned int *sse1,
                                               const uint8_t *second_pred,
+#if CONFIG_AFFINE_MOTION
+                                             int w, int h,
+                                             int use_upsampled_ref) {
+#else
                                               int w, int h) {
+#endif
    SETUP_SUBPEL_SEARCH;
+#if CONFIG_AFFINE_MOTION
+  (void) use_upsampled_ref;
+#endif
+
    besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                 z, src_stride, y, y_stride, second_pred,
                                 w, h, offset, mvjcost, mvcost,
@@ -565,8 +607,16 @@ int vp10_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
                                          int *distortion,
                                          unsigned int *sse1,
                                          const uint8_t *second_pred,
+#if CONFIG_AFFINE_MOTION
+                                        int w, int h, int use_upsampled_ref) {
+#else
                                          int w, int h) {
+#endif
    SETUP_SUBPEL_SEARCH;
+#if CONFIG_AFFINE_MOTION
+  (void) use_upsampled_ref;
+#endif
+
    besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                 z, src_stride, y, y_stride, second_pred,
                                 w, h, offset, mvjcost, mvcost,
@@ -655,6 +705,101 @@ static const MV search_step_table[12] = {
      {0, -1}, {0, 1}, {-1, 0}, {1, 0}
  };
  
+
+#if CONFIG_AFFINE_MOTION
+#if CONFIG_VP9_HIGHBITDEPTH
+// Fills comp_pred (width x height, row stride = width) from two 16-bit
+// sources: pred8, read with row stride = width, and ref8, read at every 8th
+// sample of a row with rows (ref_stride << 3) samples apart. Each output is
+// ROUND_POWER_OF_TWO(pred + ref, 1). Both 8-bit pointers are converted with
+// CONVERT_TO_SHORTPTR before use.
+static void highbd_comp_avg_upsampled_pred(uint16_t *comp_pred,
+                                           const uint8_t *pred8,
+                                           int width, int height,
+                                           const uint8_t *ref8,
+                                           int ref_stride) {
+  const int ref_row_step = ref_stride << 3;
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *out_row = comp_pred;
+  int row, col;
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      const int sum = pred_row[col] + ref_row[col << 3];
+      out_row[col] = ROUND_POWER_OF_TWO(sum, 1);
+    }
+    out_row += width;
+    pred_row += width;
+    ref_row += ref_row_step;
+  }
+}
+
+// Copies a width x height block into comp_pred (row stride = width) by taking
+// every 8th 16-bit sample from ref8, with source rows (ref_stride << 3)
+// samples apart. ref8 is converted with CONVERT_TO_SHORTPTR before use.
+static void highbd_upsampled_pred(uint16_t *comp_pred,
+                                  int width, int height,
+                                  const uint8_t *ref8,
+                                  int ref_stride) {
+  const int ref_row_step = ref_stride << 3;
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *out_row = comp_pred;
+  int row, col;
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col)
+      out_row[col] = ref_row[col << 3];
+    out_row += width;
+    ref_row += ref_row_step;
+  }
+}
+#endif
+
+// Computes the prediction error of a w x h block taken from the up-sampled
+// reference at y, measured against the source block src.
+//
+// The prediction is first gathered into a local buffer with row stride w:
+// by the highbd_* helpers when xd->cur_buf has YV12_FLAG_HIGHBITDEPTH set,
+// and by vpx_upsampled_pred / vpx_comp_avg_upsampled_pred otherwise. The
+// *comp_avg* form is used when second_pred is non-NULL. The buffer is then
+// handed to vfp->vf together with src and src_stride.
+//
+// Returns the value vfp->vf returns; *sse is whatever vfp->vf stores there.
+// The return type is unsigned int to match that value and the unsigned
+// variables every caller stores it in.
+//
+// NOTE(review): the local buffers hold 64 * 64 samples, so this assumes
+// w * h <= 64 * 64 -- confirm against the largest block size callers pass.
+static unsigned int upsampled_pref_error(const MACROBLOCKD *xd,
+                                         const vp9_variance_fn_ptr_t *vfp,
+                                         const uint8_t *const src,
+                                         const int src_stride,
+                                         const uint8_t *const y, int y_stride,
+                                         const uint8_t *second_pred,
+                                         int w, int h, unsigned int *sse) {
+  unsigned int besterr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]);
+    if (second_pred != NULL)
+      highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y,
+                                     y_stride);
+    else
+      highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+    besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride,
+                      sse);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+#else
+    DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+    (void) xd;  // only consulted in the high-bit-depth build
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    if (second_pred != NULL)
+      vpx_comp_avg_upsampled_pred(pred, second_pred, w, h, y,
+                                  y_stride);
+    else
+      vpx_upsampled_pred(pred, w, h, y, y_stride);
+
+    besterr = vfp->vf(pred, w, src, src_stride, sse);
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif
+  return besterr;
+}
+
+// Evaluates the search centre on the up-sampled reference. The prediction
+// error at y + offset is stored in *distortion (and its SSE in *sse1 by
+// upsampled_pref_error()); the return value is that error plus
+// mv_err_cost() of bestmv relative to ref_mv.
+static unsigned int upsampled_setup_center_error(
+    const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
+    int error_per_bit, const vp9_variance_fn_ptr_t *vfp,
+    const uint8_t *const src, const int src_stride,
+    const uint8_t *const y, int y_stride, const uint8_t *second_pred,
+    int w, int h, int offset, int *mvjcost, int *mvcost[2],
+    unsigned int *sse1, int *distortion) {
+  const unsigned int pred_err =
+      upsampled_pref_error(xd, vfp, src, src_stride, y + offset, y_stride,
+                           second_pred, w, h, sse1);
+
+  *distortion = pred_err;
+  return pred_err +
+         mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+}
+#endif
+
  int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                   MV *bestmv, const MV *ref_mv,
                                   int allow_hp,
@@ -667,14 +812,18 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                   int *distortion,
                                   unsigned int *sse1,
                                   const uint8_t *second_pred,
+#if CONFIG_AFFINE_MOTION
+                                 int w, int h, int use_upsampled_ref) {
+#else
                                   int w, int h) {
+#endif
    const uint8_t *const z = x->plane[0].src.buf;
    const uint8_t *const src_address = z;
    const int src_stride = x->plane[0].src.stride;
    const MACROBLOCKD *xd = &x->e_mbd;
    unsigned int besterr = INT_MAX;
    unsigned int sse;
-  int thismse;
+  unsigned int thismse;
    const int y_stride = xd->plane[0].pre[0].stride;
    const int offset = bestmv->row * y_stride + bestmv->col;
    const uint8_t *const y = xd->plane[0].pre[0].buf;
@@ -703,10 +852,19 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
    bestmv->row *= 8;
    bestmv->col *= 8;
  
-  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
-                               z, src_stride, y, y_stride, second_pred,
-                               w, h, offset, mvjcost, mvcost,
-                               sse1, distortion);
+#if CONFIG_AFFINE_MOTION
+  // use_upsampled_ref can be 0 or 1
+  if (use_upsampled_ref)
+    besterr = upsampled_setup_center_error(xd, bestmv, ref_mv, error_per_bit,
+                                           vfp, z, src_stride, y, y_stride,
+                                           second_pred, w, h, (offset << 3),
+                                           mvjcost, mvcost, sse1, distortion);
+  else
+#endif
+    besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                                 z, src_stride, y, y_stride, second_pred,
+                                 w, h, offset, mvjcost, mvcost,
+                                 sse1, distortion);
  
    (void) cost_list;  // to silence compiler warning
  
@@ -716,16 +874,29 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
        tr = br + search_step[idx].row;
        tc = bc + search_step[idx].col;
        if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
-        MV this_mv;
-        this_mv.row = tr;
-        this_mv.col = tc;
-        if (second_pred == NULL)
-          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
-                             src_address, src_stride, &sse);
-        else
-          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
-                              src_address, src_stride, &sse, second_pred);
+        MV this_mv = {tr, tc};
+
+#if CONFIG_AFFINE_MOTION
+        if (use_upsampled_ref) {
+          const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+          thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
+                                         pre_address, y_stride, second_pred,
+                                         w, h, &sse);
+        } else {
+#endif
+          const uint8_t *const pre_address = y + (tr >> 3) * y_stride +
+              (tc >> 3);
+          if (second_pred == NULL)
+            thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                               src_address, src_stride, &sse);
+          else
+            thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                                src_address, src_stride, &sse, second_pred);
+#if CONFIG_AFFINE_MOTION
+        }
+#endif
+
          cost_array[idx] = thismse +
              mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
  
@@ -747,14 +918,29 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
      tc = bc + kc;
      tr = br + kr;
      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-      const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
        MV this_mv = {tr, tc};
-      if (second_pred == NULL)
-        thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
-                           src_address, src_stride, &sse);
-      else
-        thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
-                            src_address, src_stride, &sse, second_pred);
+
+#if CONFIG_AFFINE_MOTION
+      if (use_upsampled_ref) {
+        const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+        thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
+                                       pre_address, y_stride, second_pred,
+                                       w, h, &sse);
+      } else {
+#endif
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+        if (second_pred == NULL)
+          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                             src_address, src_stride, &sse);
+        else
+          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride, &sse, second_pred);
+#if CONFIG_AFFINE_MOTION
+      }
+#endif
+
        cost_array[4] = thismse +
            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
  
@@ -776,8 +962,17 @@ int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
        bc = tc;
      }
  
-    if (iters_per_step > 1 && best_idx != -1)
-      SECOND_LEVEL_CHECKS_BEST;
+    if (iters_per_step > 1 && best_idx != -1) {
+#if CONFIG_AFFINE_MOTION
+      if (use_upsampled_ref) {
+        SECOND_LEVEL_CHECKS_BEST(1);
+      } else {
+#endif
+        SECOND_LEVEL_CHECKS_BEST(0);
+#if CONFIG_AFFINE_MOTION
+      }
+#endif
+    }
  
      tr = br;
      tc = bc;
diff --git a/vp10/encoder/mcomp.h b/vp10/encoder/mcomp.h

index 9d1ab2aabe574c35a3f0d58271197c162e61356f..3063b996e7b095a6fd22a95bd687af20d3880575 100644 (file)
--- a/vp10/encoder/mcomp.h
+++ b/vp10/encoder/mcomp.h
@@ -116,7 +116,11 @@ typedef int (fractional_mv_step_fp) (
      int *mvjcost, int *mvcost[2],
      int *distortion, unsigned int *sse1,
      const uint8_t *second_pred,
+#if CONFIG_AFFINE_MOTION
+    int w, int h, int use_upsampled_ref);
+#else
      int w, int h);
+#endif
  
  extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree;
  extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned;
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c

index 03aa9f0869c88487d680d8457a27ef9730e4e393..5c74d32eb5c9cc23ab1f5144482e5ae93999b07a 100644 (file)
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -3929,7 +3929,8 @@ static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
                                  int_mv* ref_mv_sub8x8[2],
  #endif
                                  int_mv single_newmv[MAX_REF_FRAMES],
-                                int *rate_mv) {
+                                int *rate_mv,
+                                const int block) {
    const VP10_COMMON *const cm = &cpi->common;
    const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
    const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
@@ -4076,6 +4077,40 @@ static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
      if (bestsme < INT_MAX) {
        int dis; /* TODO: use dis in distortion calculation later. */
        unsigned int sse;
+#if CONFIG_AFFINE_MOTION
+      // Use up-sampled reference frames.
+      struct macroblockd_plane *const pd = &xd->plane[0];
+      struct buf_2d backup_pred = pd->pre[0];
+      const YV12_BUFFER_CONFIG *upsampled_ref =
+          get_upsampled_ref(cpi, refs[id]);
+
+      // Set pred for Y plane
+      setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
+                       upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                       NULL, pd->subsampling_x, pd->subsampling_y);
+
+      // If bsize < BLOCK_8X8, adjust pred pointer for this block
+      if (bsize < BLOCK_8X8)
+        pd->pre[0].buf =
+            &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, block,
+            pd->pre[0].stride)) << 3];
+
+      bestsme = cpi->find_fractional_mv_step(
+          x, &tmp_mv,
+          &ref_mv[id].as_mv,
+          cpi->common.allow_high_precision_mv,
+          x->errorperbit,
+          &cpi->fn_ptr[bsize],
+          0, cpi->sf.mv.subpel_iters_per_step,
+          NULL,
+          x->nmvjointcost, x->mvcost,
+          &dis, &sse, second_pred,
+          pw, ph, 1);
+
+      // Restore the reference frames.
+      pd->pre[0] = backup_pred;
+#else
+      (void) block;
        bestsme = cpi->find_fractional_mv_step(
            x, &tmp_mv,
            &ref_mv[id].as_mv,
@@ -4087,6 +4122,7 @@ static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
            x->nmvjointcost, x->mvcost,
            &dis, &sse, second_pred,
            pw, ph);
+#endif
      }
  
      // Restore the pointer to the first (possibly scaled) prediction buffer.
@@ -4367,6 +4403,43 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
  
            if (bestsme < INT_MAX) {
              int distortion;
+#if CONFIG_AFFINE_MOTION
+            const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+            const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+            // Use up-sampled reference frames.
+            struct macroblockd_plane *const pd = &xd->plane[0];
+            struct buf_2d backup_pred = pd->pre[0];
+            const YV12_BUFFER_CONFIG *upsampled_ref =
+                get_upsampled_ref(cpi, mbmi->ref_frame[0]);
+
+            // Set pred for Y plane
+            setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
+                             upsampled_ref->y_stride,
+                             (mi_row << 3), (mi_col << 3),
+                             NULL, pd->subsampling_x, pd->subsampling_y);
+
+            // adjust pred pointer for this block
+            pd->pre[0].buf =
+                &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, i,
+                pd->pre[0].stride)) << 3];
+
+            cpi->find_fractional_mv_step(
+                x,
+                new_mv,
+                &bsi->ref_mv[0]->as_mv,
+                cm->allow_high_precision_mv,
+                x->errorperbit, &cpi->fn_ptr[bsize],
+                cpi->sf.mv.subpel_force_stop,
+                cpi->sf.mv.subpel_iters_per_step,
+                cond_cost_list(cpi, cost_list),
+                x->nmvjointcost, x->mvcost,
+                &distortion,
+                &x->pred_sse[mbmi->ref_frame[0]],
+                NULL, pw, ph, 1);
+
+            // Restore the reference frames.
+            pd->pre[0] = backup_pred;
+#else
              cpi->find_fractional_mv_step(
                  x,
                  new_mv,
@@ -4380,6 +4453,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
                  &distortion,
                  &x->pred_sse[mbmi->ref_frame[0]],
                  NULL, 0, 0);
+#endif
  
              // save motion search result for use in compound prediction
  #if CONFIG_EXT_INTER
@@ -4426,7 +4500,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
  #else
                                  seg_mvs[i],
  #endif  // CONFIG_EXT_INTER
-                                &rate_mv);
+                                &rate_mv, i);
  #if CONFIG_EXT_INTER
              compound_seg_newmvs[i][0].as_int =
                  frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
@@ -4975,6 +5049,33 @@ static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
  
    if (bestsme < INT_MAX) {
      int dis;  /* TODO: use dis in distortion calculation later. */
+#if CONFIG_AFFINE_MOTION
+    const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+    const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+    // Use up-sampled reference frames.
+    struct macroblockd_plane *const pd = &xd->plane[0];
+    struct buf_2d backup_pred = pd->pre[0];
+    const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+
+    // Set pred for Y plane
+    setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
+                     upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+
+    bestsme = cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+                                           cm->allow_high_precision_mv,
+                                           x->errorperbit,
+                                           &cpi->fn_ptr[bsize],
+                                           cpi->sf.mv.subpel_force_stop,
+                                           cpi->sf.mv.subpel_iters_per_step,
+                                           cond_cost_list(cpi, cost_list),
+                                           x->nmvjointcost, x->mvcost,
+                                           &dis, &x->pred_sse[ref], NULL,
+                                           pw, ph, 1);
+
+    // Restore the reference frames.
+    pd->pre[0] = backup_pred;
+#else
      cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
                                   cm->allow_high_precision_mv,
                                   x->errorperbit,
@@ -4984,6 +5085,7 @@ static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
                                   cond_cost_list(cpi, cost_list),
                                   x->nmvjointcost, x->mvcost,
                                   &dis, &x->pred_sse[ref], NULL, 0, 0);
+#endif
    }
    *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
@@ -5328,7 +5430,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
  
          if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
            joint_motion_search(cpi, x, bsize, frame_mv,
-                              mi_row, mi_col, NULL, single_newmv, &rate_mv);
+                              mi_row, mi_col, NULL, single_newmv, &rate_mv, 0);
          } else {
            rate_mv  = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                        &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
@@ -5358,7 +5460,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
        if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
          joint_motion_search(cpi, x, bsize, frame_mv,
                              mi_row, mi_col,
-                            single_newmv, &rate_mv);
+                            single_newmv, &rate_mv, 0);
        } else {
          rate_mv  = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                     &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h

index 066bf69f562a39f0eaafa74d0e150e6db66d029f..74702a95b2276da0b931b5bd0ec2130b8a9439a5 100644 (file)
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h
@@ -106,4 +106,20 @@ void vp10_build_prediction_by_left_preds(VP10_COMP *cpi,
  }  // extern "C"
  #endif
  
+#if CONFIG_AFFINE_MOTION
+static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(VP10_COMP *cpi,
+                                                          const int ref) {
+  // Return the up-sampled reference buffer that corresponds to reference frame `ref`.
+  int ref_idx = 0;  // stays 0 when `ref` matches none of the three cases below
+  if (ref == LAST_FRAME)
+    ref_idx = cpi->lst_fb_idx;
+  else if (ref == GOLDEN_FRAME)
+    ref_idx = cpi->gld_fb_idx;
+  else if (ref == ALTREF_FRAME)
+    ref_idx = cpi->alt_fb_idx;
+
+  return &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[ref_idx]].buf;  // two-step lookup: frame-buffer index -> upsampled_ref_idx[] -> upsampled_ref_bufs[]
+}
+#endif
+
  #endif  // VP10_ENCODER_RDOPT_H_
diff --git a/vp10/encoder/temporal_filter.c b/vp10/encoder/temporal_filter.c

index d16e4a4e20dab414b53ec22f349998848abb3629..3e1246a807e37f3066a02c51fe816cba5031a893 100644 (file)
--- a/vp10/encoder/temporal_filter.c
+++ b/vp10/encoder/temporal_filter.c
@@ -320,7 +320,11 @@ static int temporal_filter_find_matching_mb_c(VP10_COMP *cpi,
                                           0, mv_sf->subpel_iters_per_step,
                                           cond_cost_list(cpi, cost_list),
                                           NULL, NULL,
+#if CONFIG_AFFINE_MOTION
+                                         &distortion, &sse, NULL, 0, 0, 0);
+#else
                                           &distortion, &sse, NULL, 0, 0);
+#endif
  
    // Restore input state
    x->plane[0].src = src;
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c

index e8bddb0a0e0088de41c39578c267a8a447344509..3b6c419744356be0b1b896d045b8cbe867d8a777 100644 (file)
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -272,6 +272,41 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
    }
  }
  
+#if CONFIG_AFFINE_MOTION
+// Get pred block from up-sampled reference: copy every 8th byte of every 8th ref row into comp_pred.
+void vpx_upsampled_pred_c(uint8_t *comp_pred,
+                          int width, int height,
+                          const uint8_t *ref,  int ref_stride) {
+    int i, j, k;
+    int stride = ref_stride << 3;  // one output row consumes 8 rows of ref
+
+    for (i = 0; i < height; i++) {
+      for (j = 0, k = 0; j < width; j++, k += 8) {
+        comp_pred[j] = ref[k];  // k == 8 * j
+      }
+      comp_pred += width;  // output rows are written back to back (output stride == width)
+      ref += stride;
+    }
+}
+
+void vpx_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+                                   int width, int height,
+                                   const uint8_t *ref, int ref_stride) {  // combines pred with every 8th byte of every 8th ref row
+    int i, j;
+    int stride = ref_stride << 3;  // one output row consumes 8 rows of ref
+
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        const int tmp = ref[(j << 3)] + pred[j];  // sum of the sub-sampled ref byte and the matching pred byte
+        comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);  // macro defined elsewhere; presumably (tmp + 1) >> 1 -- confirm
+      }
+      comp_pred += width;  // comp_pred and pred rows are both read/written with stride == width
+      pred += width;
+      ref += stride;
+    }
+}
+#endif
+
  #if CONFIG_VP9_HIGHBITDEPTH
  static void highbd_variance64(const uint8_t *a8, int  a_stride,
                                const uint8_t *b8, int  b_stride,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index 5457d00bf0239939de0a70549c3c4c95565ca2ad..8d1afdfac1e4255072861e689c5b98950814f094 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1464,6 +1464,13 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int
  
  add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
  
+if (vpx_config("CONFIG_AFFINE_MOTION") eq "yes") {  # both prototypes below are declared only when this config value is "yes"
+  add_proto qw/void vpx_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride";
+    specialize qw/vpx_upsampled_pred sse2/;  # sse2 is the only specialization listed
+  add_proto qw/void vpx_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+    specialize qw/vpx_comp_avg_upsampled_pred sse2/;  # sse2 is the only specialization listed
+}
+
  #
  # Subpixel Variance
  #
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c

index e6c9365ab4d84e1de5a07c8c684f08bb687ba74b..7943c843c408197afff8c6adaa6f9e3d27539555 100644 (file)
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -475,3 +475,232 @@ FNS(ssse3, ssse3);
  #undef FNS
  #undef FN
  #endif  // CONFIG_USE_X86INC
+
+#if CONFIG_AFFINE_MOTION
+void vpx_upsampled_pred_sse2(uint8_t *comp_pred,
+                             int width, int height,
+                             const uint8_t *ref,  int ref_stride) {  // SSE2 version: keeps every 8th byte of every 8th ref row
+    int i, j;
+    int stride = ref_stride << 3;  // one output row consumes 8 rows of ref
+
+    if (width >= 16) {
+      // 16 output pixels per iteration: load 128 ref bytes and keep bytes 0, 8, 16, ..., 120
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j+= 16) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+          __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+          __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
+          __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
+          __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
+          __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
+          __m128i t0, t1, t2, t3;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);  // stage 1: interleave the low halves of each 32-byte pair ...
+          s1 = _mm_unpackhi_epi8(s0, s1);  // ... and the high halves
+          t1 = _mm_unpacklo_epi8(s2, s3);
+          s3 = _mm_unpackhi_epi8(s2, s3);
+          t2 = _mm_unpacklo_epi8(s4, s5);
+          s5 = _mm_unpackhi_epi8(s4, s5);
+          t3 = _mm_unpacklo_epi8(s6, s7);
+          s7 = _mm_unpackhi_epi8(s6, s7);
+
+          s0 = _mm_unpacklo_epi8(t0, s1);  // stage 2: low 4 bytes are now bytes 0, 8, 16, 24 of the 32-byte pair
+          s2 = _mm_unpacklo_epi8(t1, s3);
+          s4 = _mm_unpacklo_epi8(t2, s5);
+          s6 = _mm_unpacklo_epi8(t3, s7);
+
+          *(int *)comp_pred = _mm_cvtsi128_si32(s0);  // NOTE(review): 32-bit store through int* into a uint8_t buffer -- confirm alignment/aliasing is acceptable
+          *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(s2);
+          *(int *)(comp_pred + 8) = _mm_cvtsi128_si32(s4);
+          *(int *)(comp_pred + 12) = _mm_cvtsi128_si32(s6);
+
+          comp_pred += 16;
+          ref += 16 * 8;
+        }
+        ref += stride - (width << 3);  // rewind the bytes consumed by the inner loop; assumes width is a multiple of 16 -- TODO confirm
+      }
+    } else if (width >= 8) {
+      // 8 output pixels per iteration: load 64 ref bytes and keep bytes 0, 8, ..., 56
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j+= 8) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+          __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+          __m128i t0, t1;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);  // same two-stage byte interleave as the 16-wide path
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          t1 = _mm_unpacklo_epi8(s2, s3);
+          s3 = _mm_unpackhi_epi8(s2, s3);
+
+          s0 = _mm_unpacklo_epi8(t0, s1);
+          s2 = _mm_unpacklo_epi8(t1, s3);
+
+          *(int *)comp_pred = _mm_cvtsi128_si32(s0);  // pixels 0..3
+          *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(s2);  // pixels 4..7
+          comp_pred += 8;
+          ref += 8 * 8;
+        }
+        ref += stride - (width << 3);  // assumes width is a multiple of 8 here -- TODO confirm
+      }
+    } else {
+      // 4 output pixels per iteration: load 32 ref bytes and keep bytes 0, 8, 16, 24
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j+= 4) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i t0;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          s0 = _mm_unpacklo_epi8(t0, s1);  // low 4 bytes: ref[0], ref[8], ref[16], ref[24]
+
+          *(int *)comp_pred = _mm_cvtsi128_si32(s0);
+
+          comp_pred += 4;
+          ref += 4 * 8;
+        }
+        ref += stride - (width << 3);  // assumes width is a multiple of 4 here -- TODO confirm
+      }
+    }
+}
+
+void vpx_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
+                                      int width, int height,
+                                      const uint8_t *ref,  int ref_stride) {  // SSE2 version: comp_pred = (sub-sampled ref + pred + 1) >> 1
+    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i one = _mm_set1_epi16(1);  // rounding term added before the shift by 1
+    int i, j;
+    int stride = ref_stride << 3;  // one output row consumes 8 rows of ref
+
+    if (width >= 16) {
+      // 16 output pixels per iteration: load 128 ref bytes (keep every 8th) and 16 pred bytes
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j+= 16) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+          __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+          __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
+          __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
+          __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
+          __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
+          __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+          __m128i p1;
+          __m128i t0, t1, t2, t3;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);  // stage 1 of the byte interleave (low halves / high halves)
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          t1 = _mm_unpacklo_epi8(s2, s3);
+          s3 = _mm_unpackhi_epi8(s2, s3);
+          t2 = _mm_unpacklo_epi8(s4, s5);
+          s5 = _mm_unpackhi_epi8(s4, s5);
+          t3 = _mm_unpacklo_epi8(s6, s7);
+          s7 = _mm_unpackhi_epi8(s6, s7);
+
+          s0 = _mm_unpacklo_epi8(t0, s1);  // stage 2: low 4 bytes hold bytes 0, 8, 16, 24 of each 32-byte pair
+          s2 = _mm_unpacklo_epi8(t1, s3);
+          s4 = _mm_unpacklo_epi8(t2, s5);
+          s6 = _mm_unpacklo_epi8(t3, s7);
+
+          s0 = _mm_unpacklo_epi32(s0, s2);  // low 8 bytes: ref pixels 0..7
+          s4 = _mm_unpacklo_epi32(s4, s6);  // low 8 bytes: ref pixels 8..15
+          s0 = _mm_unpacklo_epi8(s0, zero);  // widen to 16-bit lanes
+          s4 = _mm_unpacklo_epi8(s4, zero);
+
+          p1 = _mm_unpackhi_epi8(p0, zero);  // pred pixels 8..15 as 16-bit lanes
+          p0 = _mm_unpacklo_epi8(p0, zero);  // pred pixels 0..7 as 16-bit lanes
+          p0 = _mm_adds_epu16(s0, p0);
+          p1 = _mm_adds_epu16(s4, p1);
+          p0 = _mm_adds_epu16(p0, one);
+          p1 = _mm_adds_epu16(p1, one);
+
+          p0 = _mm_srli_epi16(p0, 1);  // (ref + pred + 1) >> 1
+          p1 = _mm_srli_epi16(p1, 1);
+          p0 = _mm_packus_epi16(p0, p1);  // back to 16 bytes
+
+          *(int *)comp_pred = _mm_cvtsi128_si32(p0);  // NOTE(review): 32-bit stores through int* into a uint8_t buffer -- confirm alignment/aliasing is acceptable
+          p0 = _mm_srli_si128(p0, 4);
+          *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(p0);
+          p0 = _mm_srli_si128(p0, 4);
+          *(int *)(comp_pred + 8) = _mm_cvtsi128_si32(p0);
+          p0 = _mm_srli_si128(p0, 4);
+          *(int *)(comp_pred + 12) = _mm_cvtsi128_si32(p0);
+
+          comp_pred += 16;
+          pred += 16;
+          ref += 16 * 8;
+        }
+        ref += stride - (width << 3);  // rewind the bytes consumed by the inner loop; assumes width is a multiple of 16 -- TODO confirm
+      }
+    } else if (width >= 8) {
+      // 8 output pixels per iteration: load 64 ref bytes (keep every 8th) and 8 pred bytes
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j+= 8) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+          __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+          __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
+          __m128i t0, t1;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);  // same two-stage byte interleave as the 16-wide path
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          t1 = _mm_unpacklo_epi8(s2, s3);
+          s3 = _mm_unpackhi_epi8(s2, s3);
+
+          s0 = _mm_unpacklo_epi8(t0, s1);
+          s2 = _mm_unpacklo_epi8(t1, s3);
+          s0 = _mm_unpacklo_epi32(s0, s2);  // low 8 bytes: ref pixels 0..7
+          s0 = _mm_unpacklo_epi8(s0, zero);  // widen to 16-bit lanes
+
+          p0 = _mm_unpacklo_epi8(p0, zero);
+          p0 = _mm_adds_epu16(s0, p0);
+          p0 = _mm_adds_epu16(p0, one);
+          p0 = _mm_srli_epi16(p0, 1);  // (ref + pred + 1) >> 1
+          p0 = _mm_packus_epi16(p0, zero);
+
+          *(int *)comp_pred = _mm_cvtsi128_si32(p0);  // pixels 0..3
+          p0 = _mm_srli_si128(p0, 4);
+          *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(p0);  // pixels 4..7
+
+          comp_pred += 8;
+          pred += 8;
+          ref += 8 * 8;
+        }
+        ref += stride - (width << 3);  // assumes width is a multiple of 8 here -- TODO confirm
+      }
+    } else {
+      // 4 output pixels per iteration: load 32 ref bytes (keep every 8th) and 4 pred bytes
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j+= 4) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred);  // NOTE(review): 32-bit load through uint32_t* from a uint8_t buffer -- confirm alignment/aliasing is acceptable
+          __m128i t0;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          s0 = _mm_unpacklo_epi8(t0, s1);  // low 4 bytes: ref[0], ref[8], ref[16], ref[24]
+          s0 = _mm_unpacklo_epi8(s0, zero);  // widen to 16-bit lanes
+
+          p0 = _mm_unpacklo_epi8(p0, zero);
+          p0 = _mm_adds_epu16(s0, p0);
+          p0 = _mm_adds_epu16(p0, one);
+          p0 = _mm_srli_epi16(p0, 1);  // (ref + pred + 1) >> 1
+          p0 = _mm_packus_epi16(p0, zero);
+
+          *(int *)comp_pred = _mm_cvtsi128_si32(p0);
+
+          comp_pred += 4;
+          pred += 4;
+          ref += 4 * 8;
+        }
+        ref += stride - (width << 3);  // assumes width is a multiple of 4 here -- TODO confirm
+      }
+    }
+}
+#endif
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c

index 670144bc10fd84ebb5c10874047a847b03240ec1..521207589e03949258049718af5dff9b0a353cca 100644 (file)
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -210,6 +210,30 @@ void vpx_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
    extend_frame(ybf, inner_bw);
  }
  
+void vpx_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) {  // extends the borders of the Y plane only; no other plane is passed to extend_plane*
+  int ext_size = ybf->border;  // base extension size handed to extend_plane* for all four size arguments
+  assert(ybf->y_height - ybf->y_crop_height < 16);  // allocated size may exceed the crop size by 0..15 in each dimension
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(ybf->y_buffer, ybf->y_stride,
+                      ybf->y_crop_width, ybf->y_crop_height,
+                      ext_size, ext_size,
+                      ext_size + ybf->y_height - ybf->y_crop_height,
+                      ext_size + ybf->y_width - ybf->y_crop_width);
+    return;  // high-bitdepth buffers are fully handled above
+  }
+#endif
+  extend_plane(ybf->y_buffer, ybf->y_stride,
+               ybf->y_crop_width, ybf->y_crop_height,
+               ext_size, ext_size,  // presumably the top and left extension sizes -- confirm against extend_plane's definition
+               ext_size + ybf->y_height - ybf->y_crop_height,  // border plus the height padding beyond the crop area
+               ext_size + ybf->y_width - ybf->y_crop_width);  // border plus the width padding beyond the crop area
+}
+
  #if CONFIG_VP9_HIGHBITDEPTH
  void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl

index 56b952ba3554fe6922342b4816aa553bac6a1be8..68a1a3ec0bce7a0fea5e76861dd40de502bdac54 100644 (file)
--- a/vpx_scale/vpx_scale_rtcd.pl
+++ b/vpx_scale/vpx_scale_rtcd.pl
@@ -28,5 +28,8 @@ if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes"))
  
      add_proto qw/void vpx_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf";
      specialize qw/vpx_extend_frame_inner_borders dspr2/;
+
+    add_proto qw/void vpx_extend_frame_borders_y/, "struct yv12_buffer_config *ybf";
+    specialize qw/vpx_extend_frame_borders_y/;
  }
  1;
author	Yunqing Wang <yunqingwang@google.com>
	Tue, 16 Feb 2016 22:33:18 +0000 (14:33 -0800)
committer	Yunqing Wang <yunqingwang@google.com>
	Mon, 29 Feb 2016 20:14:47 +0000 (12:14 -0800)
configure		patch \| blob \| history
vp10/encoder/encoder.c		patch \| blob \| history
vp10/encoder/encoder.h		patch \| blob \| history
vp10/encoder/mbgraph.c		patch \| blob \| history
vp10/encoder/mcomp.c		patch \| blob \| history
vp10/encoder/mcomp.h		patch \| blob \| history
vp10/encoder/rdopt.c		patch \| blob \| history
vp10/encoder/rdopt.h		patch \| blob \| history
vp10/encoder/temporal_filter.c		patch \| blob \| history
vpx_dsp/variance.c		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history
vpx_dsp/x86/variance_sse2.c		patch \| blob \| history
vpx_scale/generic/yv12extend.c		patch \| blob \| history
vpx_scale/vpx_scale_rtcd.pl		patch \| blob \| history