From 5a88271b09f2c9412a13dae2a282b2c82bce7552 Mon Sep 17 00:00:00 2001
From: Jim Bankoski
Date: Thu, 6 Jun 2013 06:07:09 -0700
Subject: [PATCH] don't tokenize & encode tokens for blocks in UMV

This avoids encoding tokens for blocks that are entirely in the UMV
border.  This changes the bitstream.

Change-Id: I32b4df46ac8a990d0c37cee92fd34f8ddd4fb6c9
---
 test/borders_test.cc          |  86 ++++++++++++++++++++++
 test/test-data.sha1           |   3 +-
 test/test.mk                  |   2 +
 vp9/common/vp9_alloccommon.c  |   1 +
 vp9/common/vp9_blockd.h       | 131 ++++++++++++++++++++++++++++++++--
 vp9/common/vp9_mvref_common.c |  54 +++++++++++---
 vp9/decoder/vp9_decodframe.c  |   5 ++
 vp9/decoder/vp9_detokenize.c  |  20 ++++--
 vp9/encoder/vp9_encodeframe.c |  79 +++++++++++--------
 vp9/encoder/vp9_encodemb.c    |   5 +-
 vp9/encoder/vp9_rdopt.c       |  69 +++++++++++++----
 vp9/encoder/vp9_tokenize.c    |   9 ++-
 12 files changed, 385 insertions(+), 79 deletions(-)
 create mode 100644 test/borders_test.cc

diff --git a/test/borders_test.cc b/test/borders_test.cc
new file mode 100644
index 000000000..8cac4fd09
--- /dev/null
+++ b/test/borders_test.cc
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include
+#include
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class BordersTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam {
+ protected:
+  BordersTest() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, 5);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+    }
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
+    }
+  }
+};
+
+TEST_P(BordersTest, TestEncodeHighBitrate) {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q. This pushes
+  // the encoder to produce lots of big partitions, which will likely
+  // extend into the border and test the border condition.
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.rc_max_quantizer = 10;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(BordersTest, TestLowBitrate) {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q. This pushes the encoder to produce
+  // lots of small partitions, which will test the other condition.
+ + cfg_.g_lag_in_frames = 25; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_target_bitrate = 200; + cfg_.rc_min_quantizer = 40; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 40); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP9_INSTANTIATE_TEST_CASE(BordersTest, ::testing::Values( + ::libvpx_test::kTwoPassGood)); +} // namespace diff --git a/test/test-data.sha1 b/test/test-data.sha1 index c1b6a834c..98cdda0a2 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -1,4 +1,5 @@ d5dfb0151c9051f8c85999255645d7a23916d3c0 hantro_collage_w352h288.yuv +b87815bf86020c592ccc7a846ba2e28ec8043902 hantro_odd.yuv 5184c46ddca8b1fadd16742e8500115bc8f749da vp80-00-comprehensive-001.ivf 65bf1bbbced81b97bd030f376d1b7f61a224793f vp80-00-comprehensive-002.ivf 906b4c1e99eb734504c504b3f1ad8052137ce672 vp80-00-comprehensive-003.ivf @@ -120,4 +121,4 @@ f95eb6214571434f1f73ab7833b9ccdf47588020 vp80-03-segmentation-1437.ivf.md5 41d70bb5fa45bc88da1604a0af466930b8dd77b5 vp80-05-sharpness-1438.ivf.md5 086c56378df81b6cee264d7540a7b8f2b405c7a4 vp80-05-sharpness-1439.ivf.md5 d32dc2c4165eb266ea4c23c14a45459b363def32 vp80-05-sharpness-1440.ivf.md5 -8c69dc3d8e563f56ffab5ad1e400d9e689dd23df vp80-05-sharpness-1443.ivf.md5 \ No newline at end of file +8c69dc3d8e563f56ffab5ad1e400d9e689dd23df vp80-05-sharpness-1443.ivf.md5 diff --git a/test/test.mk b/test/test.mk index 0d069d026..1e0b2172e 100644 --- a/test/test.mk +++ b/test/test.mk @@ -22,6 +22,7 @@ LIBVPX_TEST_SRCS-yes += encode_test_driver.h LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../md5_utils.h ../md5_utils.c @@ -92,6 +93,7 @@ endif ## TEST DATA ## LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index b770d0505..bdebb3327 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -146,6 +146,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling // information is exposed at this level mi_cols = mi_cols_aligned_to_sb(oci); + # if CONFIG_ALPHA // TODO(jkoleszar): Why is this * 2? oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 8 * mi_cols, 1); diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index c89470b66..05b4fda1b 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -631,14 +631,14 @@ static INLINE void foreach_transformed_block_in_plane( // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 // transform size varies per plane, look it up in a common way. - const TX_SIZE tx_size = plane ? get_uv_tx_size(xd) - : xd->mode_info_context->mbmi.txfm_size; + const TX_SIZE tx_size = + plane ? 
get_uv_tx_size(xd) : xd->mode_info_context->mbmi.txfm_size; const int block_size_b = bw + bh; const int txfrm_size_b = tx_size * 2; // subsampled size of the block - const int ss_sum = xd->plane[plane].subsampling_x + - xd->plane[plane].subsampling_y; + const int ss_sum = xd->plane[plane].subsampling_x + + xd->plane[plane].subsampling_y; const int ss_block_size = block_size_b - ss_sum; const int step = 1 << txfrm_size_b; @@ -647,8 +647,42 @@ static INLINE void foreach_transformed_block_in_plane( assert(txfrm_size_b <= block_size_b); assert(txfrm_size_b <= ss_block_size); - for (i = 0; i < (1 << ss_block_size); i += step) { - visit(plane, i, bsize, txfrm_size_b, arg); + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + int r, c; + const int sw = bw - xd->plane[plane].subsampling_x; + const int sh = bh - xd->plane[plane].subsampling_y; + int max_blocks_wide = 1 << sw; + int max_blocks_high = 1 << sh; + + // xd->mb_to_right_edge is in units of pixels * 8. This converts + // it to 4x4 block sizes. + if (xd->mb_to_right_edge < 0) + max_blocks_wide += + + (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x)); + + if (xd->mb_to_bottom_edge < 0) + max_blocks_high += + + (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y)); + + i = 0; + // Unlike the normal case - in here we have to keep track of the + // row and column of the blocks we use so that we know if we are in + // the unrestricted motion border.. + for (r = 0; r < (1 << sh); r += (1 << tx_size)) { + for (c = 0; c < (1 << sw); c += (1 << tx_size)) { + if (r < max_blocks_high && c < max_blocks_wide) + visit(plane, i, bsize, txfrm_size_b, arg); + i += step; + } + } + } else { + for (i = 0; i < (1 << ss_block_size); i += step) { + visit(plane, i, bsize, txfrm_size_b, arg); + } } } @@ -780,4 +814,89 @@ static void txfrm_block_to_raster_xy(MACROBLOCKD *xd, *x = (raster_mb & (tx_cols - 1)) << (txwl); *y = raster_mb >> tx_cols_lg2 << (txwl); } + +static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block, + BLOCK_SIZE_TYPE bsize, int ss_txfrm_size) { + const int bw = plane_block_width(bsize, &xd->plane[plane]); + const int bh = plane_block_height(bsize, &xd->plane[plane]); + int x, y; + txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y); + x = x * 4 - 1; + y = y * 4 - 1; + // Copy a pixel into the umv if we are in a situation where the block size + // extends into the UMV. + // TODO(JBB): Should be able to do the full extend in place so we don't have + // to do this multiple times. 
+ if (xd->mb_to_right_edge < 0) { + int umv_border_start = bw + + (xd->mb_to_right_edge >> (3 + xd->plane[plane].subsampling_x)); + + if (x + bw > umv_border_start) + vpx_memset( + xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride + + umv_border_start, + *(xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride + + umv_border_start - 1), + bw); + } + if (xd->mb_to_bottom_edge < 0) { + int umv_border_start = bh + + (xd->mb_to_bottom_edge >> (3 + xd->plane[plane].subsampling_y)); + int i; + uint8_t c = *(xd->plane[plane].dst.buf + + (umv_border_start - 1) * xd->plane[plane].dst.stride + x); + + uint8_t *d = xd->plane[plane].dst.buf + + umv_border_start * xd->plane[plane].dst.stride + x; + + if (y + bh > umv_border_start) + for (i = 0; i < bh; i++, d += xd->plane[plane].dst.stride) + *d = c; + } +} +static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, + int plane, int ss_tx_size, int eob, int aoff, + int loff, ENTROPY_CONTEXT *A, + ENTROPY_CONTEXT *L) { + const int bw = b_width_log2(bsize), bh = b_height_log2(bsize); + const int sw = bw - xd->plane[plane].subsampling_x; + const int sh = bh - xd->plane[plane].subsampling_y; + int mi_blocks_wide = 1 << sw; + int mi_blocks_high = 1 << sh; + int tx_size_in_blocks = (1 << ss_tx_size); + int above_contexts = tx_size_in_blocks; + int left_contexts = tx_size_in_blocks; + int pt; + + // xd->mb_to_right_edge is in units of pixels * 8. This converts + // it to 4x4 block sizes. + if (xd->mb_to_right_edge < 0) { + mi_blocks_wide += (xd->mb_to_right_edge + >> (5 + xd->plane[plane].subsampling_x)); + } + + // this code attempts to avoid copying into contexts that are outside + // our border. Any blocks that do are set to 0... + if (above_contexts + aoff > mi_blocks_wide) + above_contexts = mi_blocks_wide - aoff; + + if (xd->mb_to_bottom_edge < 0) { + mi_blocks_high += (xd->mb_to_bottom_edge + >> (5 + xd->plane[plane].subsampling_y)); + } + if (left_contexts + loff > mi_blocks_high) { + left_contexts = mi_blocks_high - loff; + } + + for (pt = 0; pt < above_contexts; pt++) + A[pt] = eob > 0; + for (pt = above_contexts; pt < (1 << ss_tx_size); pt++) + A[pt] = 0; + for (pt = 0; pt < left_contexts; pt++) + L[pt] = eob > 0; + for (pt = left_contexts; pt < (1 << ss_tx_size); pt++) + L[pt] = 0; +} + + #endif // VP9_COMMON_VP9_BLOCKD_H_ diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index f79d1c0ab..224151de4 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -154,17 +154,49 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES); vpx_memset(candidate_scores, 0, sizeof(candidate_scores)); - if (mbmi->sb_type == BLOCK_SIZE_SB64X64) { - mv_ref_search = sb64_mv_ref_search; - } else if (mbmi->sb_type >= BLOCK_SIZE_SB32X32) { - mv_ref_search = sb_mv_ref_search; - } else if (mbmi->sb_type >= BLOCK_SIZE_MB16X16) { - mv_ref_search = mb_mv_ref_search; - } else { - mv_ref_search = b_mv_ref_search; - if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { - x_idx = block_idx & 1; - y_idx = block_idx >> 1; + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + int pixels_wide = 4 * b_width_log2(mbmi->sb_type); + int pixels_high = 4 * b_height_log2(mbmi->sb_type); + int pixels_square = 0; + + if (xd->mb_to_right_edge < 0) + pixels_wide += (xd->mb_to_right_edge >> 3); + + if (xd->mb_to_bottom_edge < 0) + pixels_high += (xd->mb_to_bottom_edge >> 3); + + if ( pixels_wide < pixels_high ) + pixels_square = 
pixels_wide; + else + pixels_square = pixels_high; + + if (pixels_square == 64) { + mv_ref_search = sb64_mv_ref_search; + } else if (pixels_square == 32) { + mv_ref_search = sb_mv_ref_search; + } else if (pixels_square == 16) { + mv_ref_search = mb_mv_ref_search; + } else { + mv_ref_search = b_mv_ref_search; + if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { + x_idx = block_idx & 1; + y_idx = block_idx >> 1; + } + } + } + else { + if (mbmi->sb_type == BLOCK_SIZE_SB64X64) { + mv_ref_search = sb64_mv_ref_search; + } else if (mbmi->sb_type >= BLOCK_SIZE_SB32X32) { + mv_ref_search = sb_mv_ref_search; + } else if (mbmi->sb_type >= BLOCK_SIZE_MB16X16) { + mv_ref_search = mb_mv_ref_search; + } else { + mv_ref_search = b_mv_ref_search; + if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { + x_idx = block_idx & 1; + y_idx = block_idx >> 1; + } } } diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index a9717222a..92c5c9d1b 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -240,6 +240,7 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, mode = plane == 0? xd->mode_info_context->mbmi.mode: xd->mode_info_context->mbmi.uv_mode; + if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 && plane == 0) { assert(bsize == BLOCK_SIZE_SB8X8); b_mode = xd->mode_info_context->bmi[raster_block].as_mode.first; @@ -247,6 +248,10 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, b_mode = mode; } + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + extend_for_intra(xd, plane, block, bsize, ss_txfrm_size); + } + plane_b_size = b_width_log2(bsize) - xd->plane[plane].subsampling_x; vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode, dst, xd->plane[plane].dst.stride); diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index b20807226..c91c4fcf8 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -288,9 +288,6 @@ SKIP_START: if (c < seg_eob) coef_counts[type][ref][band][pt][DCT_EOB_MODEL_TOKEN]++; - for (pt = 0; pt < (1 << txfm_size); pt++) { - A[pt] = L[pt] = c > 0; - } return c; } @@ -299,7 +296,6 @@ static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) { return vp9_get_segdata(xd, segment_id, SEG_LVL_SKIP) ? 
0 : eob_max; } - struct decode_block_args { VP9D_COMP *pbi; MACROBLOCKD *xd; @@ -314,6 +310,7 @@ static void decode_block(int plane, int block, const int bw = b_width_log2(bsize); // find the maximum eob for this transform size, adjusted by segment + MACROBLOCKD *xd = arg->xd; const int segment_id = arg->xd->mode_info_context->mbmi.segment_id; const TX_SIZE ss_tx_size = ss_txfrm_size / 2; const int seg_eob = get_eob(arg->xd, segment_id, 16 << ss_txfrm_size); @@ -322,14 +319,23 @@ static void decode_block(int plane, int block, const int mod = bw - ss_tx_size - arg->xd->plane[plane].subsampling_x; const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size; const int loff = (off >> mod) << ss_tx_size; - + int pt; + ENTROPY_CONTEXT *A = arg->xd->plane[plane].above_context + aoff; + ENTROPY_CONTEXT *L = arg->xd->plane[plane].left_context + loff; const int eob = decode_coefs(arg->pbi, arg->xd, arg->r, block, arg->xd->plane[plane].plane_type, seg_eob, BLOCK_OFFSET(qcoeff_base, block, 16), ss_tx_size, arg->xd->plane[plane].dequant, - arg->xd->plane[plane].above_context + aoff, - arg->xd->plane[plane].left_context + loff); + A, + L); + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + set_contexts_on_border(xd, bsize, plane, ss_tx_size, eob, aoff, loff, A, L); + } else { + for (pt = 0; pt < (1 << ss_tx_size); pt++) { + A[pt] = L[pt] = eob > 0; + } + } arg->xd->plane[plane].eobs[block] = eob; arg->eobtotal[0] += eob; } diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3ced30d56..6efcdd042 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -467,7 +467,9 @@ static void update_state(VP9_COMP *cpi, int i, j; for (j = 0; j < bh; ++j) for (i = 0; i < bw; ++i) - xd->mode_info_context[mis * j + i].mbmi = *mbmi; + if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > j && + (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > i) + xd->mode_info_context[mis * j + i].mbmi = *mbmi; } if (cpi->common.mcomp_filter_type == SWITCHABLE && @@ -915,13 +917,16 @@ static void set_block_size(VP9_COMMON *const cm, MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis, int mi_row, int mi_col) { int row, col; - int bsl = b_width_log2(bsize); + int bwl = b_width_log2(bsize); + int bhl = b_height_log2(bsize); + int bsl = (bwl > bhl ? bwl : bhl); + int bs = (1 << bsl) / 2; // MODE_INFO *m2 = m + mi_row * mis + mi_col; for (row = 0; row < bs; row++) { for (col = 0; col < bs; col++) { if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols) - return; + continue; m2[row*mis+col].mbmi.sb_type = bsize; } } @@ -961,21 +966,6 @@ static void fill_variance(var *v, int64_t s2, int64_t s, int c) { / v->count; } -// Fills a 16x16 variance tree node by calling get var8x8 var.. -static void fill_16x16_variance(const unsigned char *s, int sp, - const unsigned char *d, int dp, v16x16 *vt) { - unsigned int sse; - int sum; - vp9_get_sse_sum_8x8(s, sp, d, dp, &sse, &sum); - fill_variance(&vt->split[0].none, sse, sum, 64); - vp9_get_sse_sum_8x8(s + 8, sp, d + 8, dp, &sse, &sum); - fill_variance(&vt->split[1].none, sse, sum, 64); - vp9_get_sse_sum_8x8(s + 8 * sp, sp, d + 8 * dp, dp, &sse, &sum); - fill_variance(&vt->split[2].none, sse, sum, 64); - vp9_get_sse_sum_8x8(s + 8 * sp + 8, sp, d + 8 + 8 * dp, dp, &sse, &sum); - fill_variance(&vt->split[3].none, sse, sum, 64); -} - // Combine 2 variance structures by summing the sum_error, sum_square_error, // and counts and then calculating the new variance. 
void sum_2_variances(var *r, var *a, var*b) { @@ -1021,8 +1011,18 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, int sp; const unsigned char * d = xd->plane[0].pre->buf; int dp = xd->plane[0].pre->stride; + int pixels_wide = 64, pixels_high = 64; + + vpx_memset(&vt, 0, sizeof(vt)); set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64); + + if (xd->mb_to_right_edge < 0) + pixels_wide += (xd->mb_to_right_edge >> 3); + + if (xd->mb_to_bottom_edge < 0) + pixels_high += (xd->mb_to_bottom_edge >> 3); + s = x->plane[0].src.buf; sp = x->plane[0].src.stride; @@ -1034,6 +1034,7 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, d = vp9_64x64_zeros; dp = 64; // } + // Fill in the entire tree of 8x8 variances for splits. for (i = 0; i < 4; i++) { const int x32_idx = ((i & 1) << 5); @@ -1041,8 +1042,28 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, for (j = 0; j < 4; j++) { const int x_idx = x32_idx + ((j & 1) << 4); const int y_idx = y32_idx + ((j >> 1) << 4); - fill_16x16_variance(s + y_idx * sp + x_idx, sp, d + y_idx * dp + x_idx, - dp, &vt.split[i].split[j]); + const uint8_t *st = s + y_idx * sp + x_idx; + const uint8_t *dt = d + y_idx * dp + x_idx; + unsigned int sse = 0; + int sum = 0; + v16x16 *vst = &vt.split[i].split[j]; + sse = sum = 0; + if (x_idx < pixels_wide && y_idx < pixels_high) + vp9_get_sse_sum_8x8(st, sp, dt, dp, &sse, &sum); + fill_variance(&vst->split[0].none, sse, sum, 64); + sse = sum = 0; + if (x_idx + 8 < pixels_wide && y_idx < pixels_high) + vp9_get_sse_sum_8x8(st + 8, sp, dt + 8, dp, &sse, &sum); + fill_variance(&vst->split[1].none, sse, sum, 64); + sse = sum = 0; + if (x_idx < pixels_wide && y_idx + 8 < pixels_high) + vp9_get_sse_sum_8x8(st + 8 * sp, sp, dt + 8 * dp, dp, &sse, &sum); + fill_variance(&vst->split[2].none, sse, sum, 64); + sse = sum = 0; + if (x_idx + 8 < pixels_wide && y_idx + 8 < pixels_high) + vp9_get_sse_sum_8x8(st + 8 * sp + 8, sp, dt + 8 + 8 * dp, dp, &sse, + &sum); + fill_variance(&vst->split[3].none, sse, sum, 64); } } // Fill the rest of the variance tree by summing the split partition @@ -1088,8 +1109,10 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; - int bwl, bhl; + int bwl = b_width_log2(m->mbmi.sb_type); + int bhl = b_height_log2(m->mbmi.sb_type); int bsl = b_width_log2(bsize); + int bh = (1 << bhl); int bs = (1 << bsl); int bss = (1 << bsl)/4; int i, pl; @@ -1103,9 +1126,6 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, return; - bwl = b_width_log2(m->mbmi.sb_type); - bhl = b_height_log2(m->mbmi.sb_type); - // parse the partition type if ((bwl == bsl) && (bhl == bsl)) partition = PARTITION_NONE; @@ -1144,7 +1164,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, *(get_sb_index(xd, subsize)) = 0; pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize, get_block_context(x, subsize)); - if (mi_row + (bs >> 1) <= cm->mi_rows) { + if (mi_row + (bh >> 1) <= cm->mi_rows) { int rt, dt; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); @@ -1404,18 +1424,13 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; mi_col += 8) { int dummy_rate, dummy_dist; - // TODO(JBB): remove the border conditions for 64x64 blocks once its fixed - // without 
this border check choose will fail on the border of every - // non 64x64. - if (cpi->speed < 5 || - mi_col + 8 > cm->cur_tile_mi_col_end || - mi_row + 8 > cm->cur_tile_mi_row_end) { + if (cpi->speed < 5) { rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, &dummy_rate, &dummy_dist); } else { const int idx_str = cm->mode_info_stride * mi_row + mi_col; MODE_INFO *m = cm->mi + idx_str; - // set_partitioning(cpi, m, BLOCK_SIZE_SB8X8); + // set_partitioning(cpi, m, BLOCK_SIZE_SB64X64); choose_partitioning(cpi, cm->mi, mi_row, mi_col); rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, &dummy_rate, &dummy_dist); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 389c5d860..b65b2619b 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -615,6 +615,10 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, TX_TYPE tx_type; int mode, b_mode; + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + extend_for_intra(xd, plane, block, bsize, ss_txfrm_size); + } + mode = plane == 0? mbmi->mode: mbmi->uv_mode; if (plane == 0 && mbmi->sb_type < BLOCK_SIZE_SB8X8 && @@ -684,7 +688,6 @@ void vp9_encode_intra_block_uv(VP9_COMMON *cm, MACROBLOCK *x, MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; struct encode_b_args arg = {cm, x, &ctx}; - foreach_transformed_block_uv(xd, bsize, encode_block_intra, &arg); } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 761bbb6d5..c48f34ad7 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -505,33 +505,48 @@ static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { return sum > INT_MAX ? INT_MAX : (int)sum; } -static int rdcost_plane(VP9_COMMON *const cm, MACROBLOCK *x, - int plane, BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - MACROBLOCKD *const xd = &x->e_mbd; +struct rdcost_block_args { + VP9_COMMON *cm; + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[16]; + ENTROPY_CONTEXT t_left[16]; + TX_SIZE tx_size; + int bw; + int bh; + int cost; +}; + +static void rdcost_block(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { + struct rdcost_block_args* args = arg; + int x_idx, y_idx; + MACROBLOCKD * const xd = &args->x->e_mbd; + + txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx, + &y_idx); + + args->cost += cost_coeffs(args->cm, args->x, plane, block, + xd->plane[plane].plane_type, args->t_above + x_idx, + args->t_left + y_idx, args->tx_size, + args->bw * args->bh); +} + +static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane, + BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { + MACROBLOCKD * const xd = &x->e_mbd; const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; const int bw = 1 << bwl, bh = 1 << bhl; - ENTROPY_CONTEXT t_above[16], t_left[16]; - int block, cost; + struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, 0 }; - vpx_memcpy(&t_above, xd->plane[plane].above_context, + vpx_memcpy(&args.t_above, xd->plane[plane].above_context, sizeof(ENTROPY_CONTEXT) * bw); - vpx_memcpy(&t_left, xd->plane[plane].left_context, + vpx_memcpy(&args.t_left, xd->plane[plane].left_context, sizeof(ENTROPY_CONTEXT) * bh); - cost = 0; - for (block = 0; block < bw * bh; block += 1 << (tx_size * 2)) { - int x_idx, y_idx; + foreach_transformed_block_in_plane(xd, bsize, plane, rdcost_block, &args); - txfrm_block_to_raster_xy(xd, bsize, plane, block, tx_size * 2, 
- &x_idx, &y_idx); - - cost += cost_coeffs(cm, x, plane, block, xd->plane[plane].plane_type, - t_above + x_idx, t_left + y_idx, - tx_size, bw * bh); - } - - return cost; + return args.cost; } static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, @@ -582,6 +597,7 @@ static void super_block_yrd(VP9_COMP *cpi, } else { mbmi->txfm_size = TX_4X4; } + vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t)); super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs, mbmi->txfm_size); return; @@ -826,6 +842,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int64_t local_txfm_cache[NB_TXFM_MODES]; MODE_INFO *const mic = xd->mode_info_context; const int mis = xd->mode_info_stride; + if (cpi->common.frame_type == KEY_FRAME) { const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis); const MB_PREDICTION_MODE L = xd->left_available ? @@ -2410,6 +2427,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t err4x4 = INT64_MAX; int i; + vpx_memset(&txfm_cache,0,sizeof(txfm_cache)); ctx->skip = 0; xd->mode_info_context->mbmi.mode = DC_PRED; xd->mode_info_context->mbmi.ref_frame = INTRA_FRAME; @@ -2502,6 +2520,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int_mv seg_mvs[4][MAX_REF_FRAMES]; union b_mode_info best_bmodes[4]; PARTITION_INFO best_partition; + int bwsl = b_width_log2(bsize); + int bws = (1 << bwsl) / 4; // mode_info step for subsize + int bhsl = b_width_log2(bsize); + int bhs = (1 << bhsl) / 4; // mode_info step for subsize for (i = 0; i < 4; i++) { int j; @@ -2723,6 +2745,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } } + // TODO(JBB): This is to make up for the fact that we don't have sad + // functions that work when the block size reads outside the umv. We + // should fix this either by making the motion search just work on + // a representative block in the boundary ( first ) and then implement a + // function that does sads when inside the border.. + if (((mi_row + bhs) < cm->mi_rows || (mi_col + bws) < cm->mi_cols) && + this_mode == NEWMV) { + continue; + } if (this_mode == I4X4_PRED) { int rate; diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 3d8390b08..b307c54b0 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -99,6 +99,7 @@ struct tokenize_b_args { TX_SIZE tx_size; int dry_run; }; + static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *arg) { struct tokenize_b_args* const args = arg; @@ -233,8 +234,12 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, } while (c < eob && ++c < seg_eob); *tp = t; - for (pt = 0; pt < (1 << tx_size); pt++) { - A[pt] = L[pt] = c > 0; + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + set_contexts_on_border(xd, bsize, plane, tx_size, c, aoff, loff, A, L); + } else { + for (pt = 0; pt < (1 << tx_size); pt++) { + A[pt] = L[pt] = c > 0; + } } } -- 2.40.0
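For reference (not part of the patch): the arithmetic this change relies on is the conversion of xd->mb_to_right_edge / xd->mb_to_bottom_edge, which are stored in units of pixels * 8 and go negative when a block overhangs the visible frame, into a count of visible 4x4 blocks, so that transform blocks lying entirely in the UMV border can be skipped. Below is a minimal standalone C sketch of that clamp and of the bordered visit loop added to foreach_transformed_block_in_plane(); the function names, the printf, and the example numbers are illustrative and not part of libvpx.

#include <stdio.h>

/* Count of visible 4x4 blocks along one axis.  "blocks" is the block
 * dimension in 4x4 units (1 << sw or 1 << sh in the patch); "edge8" is the
 * mb_to_*_edge value in units of pixels * 8 (negative past the frame edge);
 * "ss" is the plane's subsampling shift.  >> (5 + ss) accounts for the
 * pixels*8 units (>> 3) and the 4-pixel block width (>> 2). */
static int visible_blocks(int blocks, int edge8, int ss) {
  return edge8 < 0 ? blocks + (edge8 >> (5 + ss)) : blocks;
}

/* Visit only the transform blocks whose top-left 4x4 unit is inside the
 * visible frame, mirroring the bordered path added to
 * foreach_transformed_block_in_plane().  tx_blocks is the transform size in
 * 4x4 units (1 << tx_size). */
static void visit_visible(int blocks_wide, int blocks_high,
                          int max_wide, int max_high, int tx_blocks) {
  int r, c, i = 0;
  for (r = 0; r < blocks_high; r += tx_blocks) {
    for (c = 0; c < blocks_wide; c += tx_blocks) {
      if (r < max_high && c < max_wide)
        printf("visit block %d at (%d,%d)\n", i, r, c);
      i += tx_blocks * tx_blocks;  /* raster index advances even when skipped */
    }
  }
}

int main(void) {
  /* Hypothetical example: a 64x64 luma block (16x16 4x4 units) whose right
   * edge overhangs the frame by 24 pixels and whose bottom edge overhangs
   * by 8 pixels, coded with 8x8 transforms (2 4x4 units per side). */
  const int max_w = visible_blocks(16, -24 * 8, 0);  /* 10 */
  const int max_h = visible_blocks(16, -8 * 8, 0);   /* 14 */
  visit_visible(16, 16, max_w, max_h, 2);
  return 0;
}

Only the transform blocks inside the 10x14 visible region are visited, while the raster index keeps advancing across skipped blocks, matching the i += step bookkeeping in the patch.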
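Similarly, set_contexts_on_border() clamps how many above/left entropy-context slots a transform block may set: slots corresponding to 4x4 columns or rows past the frame edge are zeroed rather than set from the eob. A self-contained sketch of that clamping follows; set_border_contexts(), the typedef, and the example values are illustrative stand-ins, and visible_wide/visible_high are assumed to already be clamped as in the sketch above.

#include <stdio.h>

typedef char ENTROPY_CONTEXT;  /* libvpx uses 1-byte context entries */

/* tx_blocks is the transform size in 4x4 units (1 << ss_tx_size); aoff/loff
 * are the block's offsets into the above/left context arrays; visible_wide
 * and visible_high are the clamped 4x4 dimensions of the visible area. */
static void set_border_contexts(ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                                int tx_blocks, int visible_wide,
                                int visible_high, int aoff, int loff,
                                int eob) {
  int above = tx_blocks, left = tx_blocks, pt;
  if (above + aoff > visible_wide) above = visible_wide - aoff;
  if (left + loff > visible_high) left = visible_high - loff;
  for (pt = 0; pt < above; pt++) A[pt] = eob > 0;     /* visible columns */
  for (pt = above; pt < tx_blocks; pt++) A[pt] = 0;   /* columns in UMV  */
  for (pt = 0; pt < left; pt++) L[pt] = eob > 0;      /* visible rows    */
  for (pt = left; pt < tx_blocks; pt++) L[pt] = 0;    /* rows in UMV     */
}

int main(void) {
  ENTROPY_CONTEXT A[4] = { 9, 9, 9, 9 }, L[4] = { 9, 9, 9, 9 };
  /* 16x16 transform (4 4x4 units per side) at aoff = loff = 0, in a block
   * where only 3 columns and 2 rows of 4x4 units are visible, nonzero eob. */
  set_border_contexts(A, L, 4, 3, 2, 0, 0, 5);
  printf("A: %d %d %d %d  L: %d %d %d %d\n",
         A[0], A[1], A[2], A[3], L[0], L[1], L[2], L[3]);
  /* prints: A: 1 1 1 0  L: 1 1 0 0 */
  return 0;
}

This is why the per-block context loops in vp9_detokenize.c and vp9_tokenize.c remain only on the non-border path, with the border path delegating to set_contexts_on_border().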
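Finally, extend_for_intra() keeps intra prediction well-defined for blocks that overhang the frame by replicating the last visible pixel into the UMV region of the destination buffer before the predictor reads it. The sketch below shows the idea as a full right/down edge extension on a small buffer; the patch itself extends per transform block (its TODO notes a full in-place extend as future work), so extend_right()/extend_down() and the buffer here are simplified, hypothetical stand-ins.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Replicate the last visible column of each row to the right. */
static void extend_right(uint8_t *dst, int stride, int rows,
                         int visible_cols, int total_cols) {
  int r;
  for (r = 0; r < rows; ++r) {
    uint8_t *row = dst + r * stride;
    memset(row + visible_cols, row[visible_cols - 1],
           total_cols - visible_cols);
  }
}

/* Replicate the last visible row downwards. */
static void extend_down(uint8_t *dst, int stride, int cols,
                        int visible_rows, int total_rows) {
  int r;
  for (r = visible_rows; r < total_rows; ++r)
    memcpy(dst + r * stride, dst + (visible_rows - 1) * stride, cols);
}

int main(void) {
  /* 4x4 destination tile of which only the top-left 2x2 lies inside the
   * visible frame. */
  uint8_t buf[16] = {
    10, 20, 0, 0,
    30, 40, 0, 0,
     0,  0, 0, 0,
     0,  0, 0, 0,
  };
  int r;
  extend_right(buf, 4, 2, 2, 4);
  extend_down(buf, 4, 4, 2, 4);
  for (r = 0; r < 4; ++r)
    printf("%d %d %d %d\n", buf[r * 4], buf[r * 4 + 1], buf[r * 4 + 2],
           buf[r * 4 + 3]);
  /* prints:
   * 10 20 20 20
   * 30 40 40 40
   * 30 40 40 40
   * 30 40 40 40 */
  return 0;
}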