From 552d5cd715accc0672e1c3ffd46366bcd556fa08 Mon Sep 17 00:00:00 2001
From: Geza Lore
Date: Mon, 7 Mar 2016 13:46:39 +0000
Subject: [PATCH] Extend superblock size to 128x128 pixels.

If --enable-ext-partition is used at build time, the superblock size
(sometimes also referred to as coding unit (CU) size) is extended to
128x128 pixels.

Change-Id: Ie09cec6b7e8d765b7555ff5d80974aab60803f3a
---
 test/convolve_test.cc | 2 +-
 test/masked_sad_test.cc | 28 +-
 test/masked_variance_test.cc | 128 +++---
 ...{vp9_subtract_test.cc => subtract_test.cc} | 10 +-
 test/test.mk | 3 +-
 vp10/common/blockd.h | 14 +-
 vp10/common/common_data.h | 376 ++++++++++++------
 vp10/common/entropymode.c | 42 ++
 vp10/common/entropymode.h | 2 +-
 vp10/common/enums.h | 85 ++--
 vp10/common/loopfilter.c | 124 +++---
 vp10/common/loopfilter.h | 4 +-
 vp10/common/mvref_common.c | 44 +-
 vp10/common/mvref_common.h | 17 +-
 vp10/common/onyxc_int.h | 4 +-
 vp10/common/reconinter.c | 240 +++++++----
 vp10/common/reconinter.h | 4 +-
 vp10/common/reconintra.c | 197 +++++++--
 vp10/common/thread_common.c | 12 +
 vp10/common/vp10_convolve.c | 4 +-
 vp10/decoder/decodeframe.c | 77 ++--
 vp10/decoder/decoder.h | 8 +-
 vp10/decoder/detokenize.c | 4 +-
 vp10/encoder/aq_complexity.c | 4 +-
 vp10/encoder/aq_cyclicrefresh.c | 4 +-
 vp10/encoder/aq_variance.c | 18 +-
 vp10/encoder/bitstream.c | 2 +-
 vp10/encoder/block.h | 20 +-
 vp10/encoder/context_tree.c | 28 +-
 vp10/encoder/context_tree.h | 1 -
 vp10/encoder/denoiser.c | 2 +-
 vp10/encoder/encodeframe.c | 349 +++++++++-------
 vp10/encoder/encodemb.c | 10 +-
 vp10/encoder/encoder.c | 4 +-
 vp10/encoder/encoder.h | 4 +-
 vp10/encoder/mcomp.c | 20 +-
 vp10/encoder/picklpf.c | 2 +-
 vp10/encoder/quantize.c | 2 +-
 vp10/encoder/quantize.h | 1 +
 vp10/encoder/rd.c | 11 +-
 vp10/encoder/rd.h | 4 +-
 vp10/encoder/rdopt.c | 184 ++++-----
 vp10/encoder/segmentation.c | 6 +-
 vp10/encoder/speed_features.c | 20 +-
 vp10/encoder/tokenize.c | 2 +-
 vp10/encoder/x86/denoiser_sse2.c | 4 +-
 vpx_dsp/vpx_convolve.c | 40 +-
 vpx_dsp/vpx_dsp_common.h | 13 +-
 vpx_dsp/x86/convolve.h | 34 +-
 vpx_dsp/x86/subtract_sse2.asm | 20 +
 vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 22 +-
 vpx_scale/yv12config.h | 6 +-
 52 files changed, 1445 insertions(+), 821 deletions(-)
 rename test/{vp9_subtract_test.cc => subtract_test.cc} (97%)

diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 0e54c4013..fdea61f29 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -28,7 +28,7 @@
 
 namespace {
 
-static const unsigned int kMaxDimension = MAX_CU_SIZE;
+static const unsigned int kMaxDimension = MAX_SB_SIZE;
 
 typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index 34223eac8..13fff0f0d 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -50,16 +50,16 @@ class MaskedSADTest : public ::testing::TestWithParam {
 TEST_P(MaskedSADTest, OperationCheck) {
   unsigned int ref_ret, ret;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
   int err_count = 0;
   int first_failure = -1;
-  int src_stride = MAX_CU_SIZE;
-  int ref_stride = MAX_CU_SIZE;
-  int
msk_stride = MAX_CU_SIZE; + int src_stride = MAX_SB_SIZE; + int ref_stride = MAX_SB_SIZE; + int msk_stride = MAX_SB_SIZE; for (int i = 0; i < number_of_iterations; ++i) { - for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) { + for (int j = 0; j < MAX_SB_SIZE*MAX_SB_SIZE; j++) { src_ptr[j] = rnd.Rand8(); ref_ptr[j] = rnd.Rand8(); msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64; @@ -108,18 +108,18 @@ class HighbdMaskedSADTest : public ::testing:: TEST_P(HighbdMaskedSADTest, OperationCheck) { unsigned int ref_ret, ret; ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); - DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); - DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); + DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr); uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr); int err_count = 0; int first_failure = -1; - int src_stride = MAX_CU_SIZE; - int ref_stride = MAX_CU_SIZE; - int msk_stride = MAX_CU_SIZE; + int src_stride = MAX_SB_SIZE; + int ref_stride = MAX_SB_SIZE; + int msk_stride = MAX_SB_SIZE; for (int i = 0; i < number_of_iterations; ++i) { - for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) { + for (int j = 0; j < MAX_SB_SIZE*MAX_SB_SIZE; j++) { src_ptr[j] = rnd.Rand16()&0xfff; ref_ptr[j] = rnd.Rand16()&0xfff; msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64; diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc index 1f8bf1e22..1710285df 100644 --- a/test/masked_variance_test.cc +++ b/test/masked_variance_test.cc @@ -58,17 +58,17 @@ TEST_P(MaskedVarianceTest, OperationCheck) { unsigned int ref_ret, opt_ret; unsigned int ref_sse, opt_sse; ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); - DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); - DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); + DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); int err_count = 0; int first_failure = -1; - int src_stride = MAX_CU_SIZE; - int ref_stride = MAX_CU_SIZE; - int msk_stride = MAX_CU_SIZE; + int src_stride = MAX_SB_SIZE; + int ref_stride = MAX_SB_SIZE; + int msk_stride = MAX_SB_SIZE; for (int i = 0; i < number_of_iterations; ++i) { - for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) { + for (int j = 0; j < MAX_SB_SIZE*MAX_SB_SIZE; j++) { src_ptr[j] = rnd.Rand8(); ref_ptr[j] = rnd.Rand8(); msk_ptr[j] = rnd(65); @@ -100,19 +100,19 @@ TEST_P(MaskedVarianceTest, ExtremeValues) { unsigned int ref_ret, opt_ret; unsigned int ref_sse, opt_sse; ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); - DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); - DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); + DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); int err_count = 0; int first_failure = -1; - int src_stride = MAX_CU_SIZE; - int ref_stride = MAX_CU_SIZE; - int msk_stride = MAX_CU_SIZE; + int src_stride = 
MAX_SB_SIZE; + int ref_stride = MAX_SB_SIZE; + int msk_stride = MAX_SB_SIZE; for (int i = 0; i < 8; ++i) { - memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_CU_SIZE*MAX_CU_SIZE); - memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_CU_SIZE*MAX_CU_SIZE); - memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_CU_SIZE*MAX_CU_SIZE); + memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_SB_SIZE*MAX_SB_SIZE); + memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_SB_SIZE*MAX_SB_SIZE); + memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_SB_SIZE*MAX_SB_SIZE); ref_ret = ref_func_(src_ptr, src_stride, ref_ptr, ref_stride, @@ -166,21 +166,21 @@ TEST_P(MaskedSubPixelVarianceTest, OperationCheck) { unsigned int ref_ret, opt_ret; unsigned int ref_sse, opt_sse; ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); - DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); - DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); + DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); + DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); int err_count = 0; int first_failure = -1; - int src_stride = (MAX_CU_SIZE+1); - int ref_stride = (MAX_CU_SIZE+1); - int msk_stride = (MAX_CU_SIZE+1); + int src_stride = (MAX_SB_SIZE+1); + int ref_stride = (MAX_SB_SIZE+1); + int msk_stride = (MAX_SB_SIZE+1); int xoffset; int yoffset; for (int i = 0; i < number_of_iterations; ++i) { int xoffsets[] = {0, 4, rnd(BIL_SUBPEL_SHIFTS)}; int yoffsets[] = {0, 4, rnd(BIL_SUBPEL_SHIFTS)}; - for (int j = 0; j < (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1); j++) { + for (int j = 0; j < (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1); j++) { src_ptr[j] = rnd.Rand8(); ref_ptr[j] = rnd.Rand8(); msk_ptr[j] = rnd(65); @@ -221,23 +221,23 @@ TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) { unsigned int ref_ret, opt_ret; unsigned int ref_sse, opt_sse; ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); - DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); - DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); + DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); + DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); int first_failure_x = -1; int first_failure_y = -1; int err_count = 0; int first_failure = -1; - int src_stride = (MAX_CU_SIZE+1); - int ref_stride = (MAX_CU_SIZE+1); - int msk_stride = (MAX_CU_SIZE+1); + int src_stride = (MAX_SB_SIZE+1); + int ref_stride = (MAX_SB_SIZE+1); + int msk_stride = (MAX_SB_SIZE+1); for (int xoffset = 0 ; xoffset < BIL_SUBPEL_SHIFTS ; xoffset++) { for (int yoffset = 0 ; yoffset < BIL_SUBPEL_SHIFTS ; yoffset++) { for (int i = 0; i < 8; ++i) { - memset(src_ptr, (i & 0x1) ? 255 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)); - memset(ref_ptr, (i & 0x2) ? 255 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)); - memset(msk_ptr, (i & 0x4) ? 64 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)); + memset(src_ptr, (i & 0x1) ? 255 : 0, (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)); + memset(ref_ptr, (i & 0x2) ? 255 : 0, (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)); + memset(msk_ptr, (i & 0x4) ? 
64 : 0, (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)); ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, @@ -297,19 +297,19 @@ TEST_P(HighbdMaskedVarianceTest, OperationCheck) { unsigned int ref_ret, opt_ret; unsigned int ref_sse, opt_sse; ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); - DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); - DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); + DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr); uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr); int err_count = 0; int first_failure = -1; - int src_stride = MAX_CU_SIZE; - int ref_stride = MAX_CU_SIZE; - int msk_stride = MAX_CU_SIZE; + int src_stride = MAX_SB_SIZE; + int ref_stride = MAX_SB_SIZE; + int msk_stride = MAX_SB_SIZE; for (int i = 0; i < number_of_iterations; ++i) { - for (int j = 0; j < MAX_CU_SIZE*MAX_CU_SIZE; j++) { + for (int j = 0; j < MAX_SB_SIZE*MAX_SB_SIZE; j++) { src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1); ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1); msk_ptr[j] = rnd(65); @@ -341,23 +341,23 @@ TEST_P(HighbdMaskedVarianceTest, ExtremeValues) { unsigned int ref_ret, opt_ret; unsigned int ref_sse, opt_sse; ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); - DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); - DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_CU_SIZE*MAX_CU_SIZE]); + DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]); uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr); uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr); int err_count = 0; int first_failure = -1; - int src_stride = MAX_CU_SIZE; - int ref_stride = MAX_CU_SIZE; - int msk_stride = MAX_CU_SIZE; + int src_stride = MAX_SB_SIZE; + int ref_stride = MAX_SB_SIZE; + int msk_stride = MAX_SB_SIZE; for (int i = 0; i < 8; ++i) { vpx_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0, - MAX_CU_SIZE*MAX_CU_SIZE); + MAX_SB_SIZE*MAX_SB_SIZE); vpx_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0, - MAX_CU_SIZE*MAX_CU_SIZE); - memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_CU_SIZE*MAX_CU_SIZE); + MAX_SB_SIZE*MAX_SB_SIZE); + memset(msk_ptr, (i & 0x4) ? 
64 : 0, MAX_SB_SIZE*MAX_SB_SIZE); ref_ret = ref_func_(src8_ptr, src_stride, ref8_ptr, ref_stride, @@ -407,24 +407,24 @@ TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) { unsigned int ref_ret, opt_ret; unsigned int ref_sse, opt_sse; ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); - DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); - DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); + DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); + DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr); uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr); int err_count = 0; int first_failure = -1; int first_failure_x = -1; int first_failure_y = -1; - int src_stride = (MAX_CU_SIZE+1); - int ref_stride = (MAX_CU_SIZE+1); - int msk_stride = (MAX_CU_SIZE+1); + int src_stride = (MAX_SB_SIZE+1); + int ref_stride = (MAX_SB_SIZE+1); + int msk_stride = (MAX_SB_SIZE+1); int xoffset, yoffset; for (int i = 0; i < number_of_iterations; ++i) { for (xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) { for (yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) { - for (int j = 0; j < (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1); j++) { + for (int j = 0; j < (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1); j++) { src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1); ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1); msk_ptr[j] = rnd(65); @@ -465,27 +465,27 @@ TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) { unsigned int ref_ret, opt_ret; unsigned int ref_sse, opt_sse; ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); - DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); - DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)]); + DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); + DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]); uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr); uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr); int first_failure_x = -1; int first_failure_y = -1; int err_count = 0; int first_failure = -1; - int src_stride = (MAX_CU_SIZE+1); - int ref_stride = (MAX_CU_SIZE+1); - int msk_stride = (MAX_CU_SIZE+1); + int src_stride = (MAX_SB_SIZE+1); + int ref_stride = (MAX_SB_SIZE+1); + int msk_stride = (MAX_SB_SIZE+1); for (int xoffset = 0 ; xoffset < BIL_SUBPEL_SHIFTS ; xoffset++) { for (int yoffset = 0 ; yoffset < BIL_SUBPEL_SHIFTS ; yoffset++) { for (int i = 0; i < 8; ++i) { vpx_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0, - (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)); + (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)); vpx_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0, - (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)); - memset(msk_ptr, (i & 0x4) ? 64 : 0, (MAX_CU_SIZE+1)*(MAX_CU_SIZE+1)); + (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)); + memset(msk_ptr, (i & 0x4) ? 
64 : 0, (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)); ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, diff --git a/test/vp9_subtract_test.cc b/test/subtract_test.cc similarity index 97% rename from test/vp9_subtract_test.cc rename to test/subtract_test.cc index 3cad4d7e6..a3f015277 100644 --- a/test/vp9_subtract_test.cc +++ b/test/subtract_test.cc @@ -10,13 +10,16 @@ #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" +#if CONFIG_VP10 +#include "vp10/common/blockd.h" +#elif CONFIG_VP9 #include "vp9/common/vp9_blockd.h" +#endif #include "vpx_mem/vpx_mem.h" typedef void (*SubtractFunc)(int rows, int cols, @@ -24,7 +27,7 @@ typedef void (*SubtractFunc)(int rows, int cols, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); -namespace vp9 { +namespace { class VP9SubtractBlockTest : public ::testing::TestWithParam { public: @@ -105,5 +108,4 @@ INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest, INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_msa)); #endif - -} // namespace vp9 +} // namespace diff --git a/test/test.mk b/test/test.mk index db2e361eb..b173ec3fd 100644 --- a/test/test.mk +++ b/test/test.mk @@ -147,7 +147,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += subtract_test.cc ifeq ($(CONFIG_VP9_ENCODER),yes) LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc @@ -172,6 +172,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht16x16_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ANS) += vp10_ans_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc ifeq ($(CONFIG_EXT_INTER),yes) LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h index fb3f44b12..821d67c95 100644 --- a/vp10/common/blockd.h +++ b/vp10/common/blockd.h @@ -44,9 +44,6 @@ typedef enum { #define IsInterpolatingFilter(filter) (1) #endif // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS -#define MAXTXLEN 32 -#define CU_SIZE 64 - static INLINE int is_inter_mode(PREDICTION_MODE mode) { #if CONFIG_EXT_INTER return mode >= NEARESTMV && mode <= NEW_NEWMV; @@ -167,8 +164,8 @@ typedef struct { PREDICTION_MODE mode; TX_SIZE tx_size; #if CONFIG_VAR_TX - // TODO(jingning): This effectively assigned an entry for each 8x8 block. - // Apparently it takes much more space than needed. + // TODO(jingning): This effectively assigned a separate entry for each + // 8x8 block. Apparently it takes much more space than needed. 
  TX_SIZE inter_tx_size[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
 #endif
   int8_t skip;
@@ -318,15 +315,15 @@ typedef struct macroblockd {
   const YV12_BUFFER_CONFIG *cur_buf;
 
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
-  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][2 * MI_BLOCK_SIZE];
 
   PARTITION_CONTEXT *above_seg_context;
-  PARTITION_CONTEXT left_seg_context[8];
+  PARTITION_CONTEXT left_seg_context[MI_BLOCK_SIZE];
 
 #if CONFIG_VAR_TX
   TXFM_CONTEXT *above_txfm_context;
   TXFM_CONTEXT *left_txfm_context;
-  TXFM_CONTEXT left_txfm_context_buffer[8];
+  TXFM_CONTEXT left_txfm_context_buffer[MI_BLOCK_SIZE];
 
   TX_SIZE max_tx_size;
 #if CONFIG_SUPERTX
@@ -686,6 +683,7 @@ void vp10_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
 
 #if CONFIG_EXT_INTER
 static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
+  // TODO(debargha): Should this be bsize < BLOCK_LARGEST?
   return (bsize >= BLOCK_8X8) && (bsize < BLOCK_64X64);
 }
 
diff --git a/vp10/common/common_data.h b/vp10/common/common_data.h
index 67d6e3a81..44ebff2dc 100644
--- a/vp10/common/common_data.h
+++ b/vp10/common/common_data.h
@@ -19,154 +19,282 @@ extern "C" {
 #endif
 
+#if CONFIG_EXT_PARTITION
+# define IF_EXT_PARTITION(...) __VA_ARGS__
+#else
+# define IF_EXT_PARTITION(...)
+#endif
+
 // Log 2 conversion lookup tables for block width and height
 static const uint8_t b_width_log2_lookup[BLOCK_SIZES] =
-  {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
+  {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, IF_EXT_PARTITION(4, 5, 5)};
 static const uint8_t b_height_log2_lookup[BLOCK_SIZES] =
-  {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
-static const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
-  {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
-static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] =
-  {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
+  {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, IF_EXT_PARTITION(5, 4, 5)};
 
 // Log 2 conversion lookup tables for modeinfo width and height
 static const uint8_t mi_width_log2_lookup[BLOCK_SIZES] =
-  {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
+  {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, IF_EXT_PARTITION(3, 4, 4)};
 static const uint8_t mi_height_log2_lookup[BLOCK_SIZES] =
-  {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
+  {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, IF_EXT_PARTITION(4, 3, 4)};
+
+// Width/height lookup tables in units of various block sizes
+static const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
+  {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, IF_EXT_PARTITION(16, 32, 32)};
+static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] =
+  {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, IF_EXT_PARTITION(32, 16, 32)};
 static const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
-  {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
+  {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, IF_EXT_PARTITION(8, 16, 16)};
 static const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] =
-  {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
+  {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, IF_EXT_PARTITION(16, 8, 16)};
+static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES] =
+  {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, IF_EXT_PARTITION(4, 8, 8)};
+static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES] =
+  {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, IF_EXT_PARTITION(8, 4, 8)};
 
 // VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize)))
 static const uint8_t size_group_lookup[BLOCK_SIZES] =
-  {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
+  {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, IF_EXT_PARTITION(3, 3, 3)};
 
 static const uint8_t
num_pels_log2_lookup[BLOCK_SIZES] = - {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12}; + {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, IF_EXT_PARTITION(13, 13, 14)}; -static const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = { - { // 4X4 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 - PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, +static const PARTITION_TYPE + partition_lookup[MAX_SB_SIZE_LOG2 - 1][BLOCK_SIZES] = { + { // 4X4 -> + // 4X4 + PARTITION_NONE, + // 4X8, 8X4, 8X8 + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + // 8X16, 16X8, 16X16 + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + // 16X32, 32X16, 32X32 + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + // 32X64, 64X32, 64X64 + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, +#if CONFIG_EXT_PARTITION + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, +#endif // CONFIG_EXT_PARTITION + }, { // 8X8 -> + // 4X4 + PARTITION_SPLIT, + // 4X8, 8X4, 8X8 + PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, + // 8X16, 16X8, 16X16 + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + // 16X32, 32X16, 32X32 + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + // 32X64, 64X32, 64X64 PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, +#endif // CONFIG_EXT_PARTITION + }, { // 16X16 -> + // 4X4 + PARTITION_SPLIT, + // 4X8, 8X4, 8X8 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + // 8X16, 16X8, 16X16 + PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, + // 16X32, 32X16, 32X32 PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, - PARTITION_INVALID - }, { // 8X8 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 - PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, + // 32X64, 64X32, 64X64 PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, - PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID - }, { // 16X16 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 - PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, - PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, +#endif // CONFIG_EXT_PARTITION + }, { // 32X32 -> + // 4X4 + PARTITION_SPLIT, + // 4X8, 8X4, 8X8 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + // 8X16, 16X8, 16X16 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + // 16X32, 32X16, 32X32 + PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, + // 32X64, 64X32, 64X64 PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, - PARTITION_INVALID, PARTITION_INVALID - }, { // 32X32 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 - PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, - PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, - PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, - PARTITION_INVALID, PARTITION_INVALID - }, { // 64X64 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 - PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, - PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, - PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, - PARTITION_NONE +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, 
+#endif // CONFIG_EXT_PARTITION + }, { // 64X64 -> + // 4X4 + PARTITION_SPLIT, + // 4X8, 8X4, 8X8 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + // 8X16, 16X8, 16X16 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + // 16X32, 32X16, 32X32 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + // 32X64, 64X32, 64X64 + PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + }, { // 128x128 -> + // 4X4 + PARTITION_SPLIT, + // 4X8, 8X4, 8X8 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + // 8X16, 16X8, 16X16 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + // 16X32, 32X16, 32X32 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + // 32X64, 64X32, 64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + // 64x128, 128x64, 128x128 + PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, +#endif // CONFIG_EXT_PARTITION } }; #if CONFIG_EXT_PARTITION_TYPES -static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][BLOCK_SIZES] = { +static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][BLOCK_SIZES] = +#else +static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = +#endif // CONFIG_EXT_PARTITION_TYPES +{ { // PARTITION_NONE - BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, - BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, - BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, - BLOCK_64X64, + // 4X4 + BLOCK_4X4, + // 4X8, 8X4, 8X8 + BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, + // 8X16, 16X8, 16X16 + BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, + // 16X32, 32X16, 32X32 + BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, + // 32X64, 64X32, 64X64 + BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + BLOCK_64X128, BLOCK_128X64, BLOCK_128X128, +#endif // CONFIG_EXT_PARTITION }, { // PARTITION_HORZ - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_64X32, + // 4X4 + BLOCK_INVALID, + // 4X8, 8X4, 8X8 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4, + // 8X16, 16X8, 16X16 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + // 16X32, 32X16, 32X32 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, + // 32X64, 64X32, 64X64 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, +#endif // CONFIG_EXT_PARTITION }, { // PARTITION_VERT - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_32X64, + // 4X4 + BLOCK_INVALID, + // 4X8, 8X4, 8X8 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8, + // 8X16, 16X8, 16X16 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + // 16X32, 32X16, 32X32 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, + // 32X64, 64X32, 64X64 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, +#endif // CONFIG_EXT_PARTITION }, { // PARTITION_SPLIT - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_4X4, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_16X16, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_32X32, + // 4X4 + BLOCK_INVALID, + // 4X8, 8X4, 8X8 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, + // 8X16, 16X8, 16X16 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8, + // 16X32, 32X16, 32X32 + 
BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16, + // 32X64, 64X32, 64X64 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64, +#endif // CONFIG_EXT_PARTITION +#if CONFIG_EXT_PARTITION_TYPES }, { // PARTITION_HORZ_A - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_64X32, + // 4X4 + BLOCK_INVALID, + // 4X8, 8X4, 8X8 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4, + // 8X16, 16X8, 16X16 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + // 16X32, 32X16, 32X32 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, + // 32X64, 64X32, 64X64 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, +#endif // CONFIG_EXT_PARTITION }, { // PARTITION_HORZ_B - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_64X32, + // 4X4 + BLOCK_INVALID, + // 4X8, 8X4, 8X8 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4, + // 8X16, 16X8, 16X16 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + // 16X32, 32X16, 32X32 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, + // 32X64, 64X32, 64X64 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, +#endif // CONFIG_EXT_PARTITION }, { // PARTITION_VERT_A - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_32X64, + // 4X4 + BLOCK_INVALID, + // 4X8, 8X4, 8X8 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8, + // 8X16, 16X8, 16X16 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + // 16X32, 32X16, 32X32 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, + // 32X64, 64X32, 64X64 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, +#endif // CONFIG_EXT_PARTITION }, { // PARTITION_VERT_B - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_32X64, - } -}; -#else -static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = { - { // PARTITION_NONE - BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, - BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, - BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, - BLOCK_64X64, - }, { // PARTITION_HORZ - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_64X32, - }, { // PARTITION_VERT - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_32X64, - }, { // PARTITION_SPLIT - BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_4X4, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_16X16, BLOCK_INVALID, BLOCK_INVALID, - BLOCK_32X32, + // 4X4 + BLOCK_INVALID, + // 4X8, 8X4, 8X8 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8, + // 8X16, 16X8, 16X16 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + // 16X32, 32X16, 32X32 
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, + // 32X64, 64X32, 64X64 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES } }; -#endif // CONFIG_EXT_PARTITION_TYPES static const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = { - TX_4X4, TX_4X4, TX_4X4, - TX_8X8, TX_8X8, TX_8X8, - TX_16X16, TX_16X16, TX_16X16, - TX_32X32, TX_32X32, TX_32X32, TX_32X32 + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X4, TX_4X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X8, TX_8X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X16, TX_16X16, TX_32X32, + // 32X64, 64X32, 64X64 + TX_32X32, TX_32X32, TX_32X32, +#if CONFIG_EXT_PARTITION + // 64x128, 128x64, 128x128 + TX_32X32, TX_32X32, TX_32X32, +#endif // CONFIG_EXT_PARTITION }; static const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = { @@ -200,6 +328,11 @@ static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { {{BLOCK_32X64, BLOCK_32X32}, {BLOCK_INVALID, BLOCK_16X32}}, {{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32, BLOCK_32X16}}, {{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}}, +#if CONFIG_EXT_PARTITION + {{BLOCK_64X128, BLOCK_64X64}, {BLOCK_INVALID, BLOCK_32X64}}, + {{BLOCK_128X64, BLOCK_INVALID}, {BLOCK_64X64, BLOCK_64X32}}, + {{BLOCK_128X128, BLOCK_128X64}, {BLOCK_64X128, BLOCK_64X64}}, +#endif // CONFIG_EXT_PARTITION }; // Generates 4 bit field in which each bit set to 1 represents @@ -209,6 +342,24 @@ static const struct { PARTITION_CONTEXT above; PARTITION_CONTEXT left; } partition_context_lookup[BLOCK_SIZES]= { +#if CONFIG_EXT_PARTITION + {31, 31}, // 4X4 - {0b11111, 0b11111} + {31, 30}, // 4X8 - {0b11111, 0b11110} + {30, 31}, // 8X4 - {0b11110, 0b11111} + {30, 30}, // 8X8 - {0b11110, 0b11110} + {30, 28}, // 8X16 - {0b11110, 0b11100} + {28, 30}, // 16X8 - {0b11100, 0b11110} + {28, 28}, // 16X16 - {0b11100, 0b11100} + {28, 24}, // 16X32 - {0b11100, 0b11000} + {24, 28}, // 32X16 - {0b11000, 0b11100} + {24, 24}, // 32X32 - {0b11000, 0b11000} + {24, 16}, // 32X64 - {0b11000, 0b10000} + {16, 24}, // 64X32 - {0b10000, 0b11000} + {16, 16}, // 64X64 - {0b10000, 0b10000} + {16, 0 }, // 64X128- {0b10000, 0b00000} + {0, 16}, // 128X64- {0b00000, 0b10000} + {0, 0 }, // 128X128-{0b00000, 0b00000} +#else {15, 15}, // 4X4 - {0b1111, 0b1111} {15, 14}, // 4X8 - {0b1111, 0b1110} {14, 15}, // 8X4 - {0b1110, 0b1111} @@ -222,6 +373,7 @@ static const struct { {8, 0 }, // 32X64 - {0b1000, 0b0000} {0, 8 }, // 64X32 - {0b0000, 0b1000} {0, 0 }, // 64X64 - {0b0000, 0b0000} +#endif // CONFIG_EXT_PARTITION }; #if CONFIG_SUPERTX diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c index b57ed7abd..29d541951 100644 --- a/vp10/common/entropymode.c +++ b/vp10/common/entropymode.c @@ -171,6 +171,13 @@ static const vpx_prob default_partition_probs[PARTITION_CONTEXTS] { 72, 16, 44, 128, 128, 128, 128 }, // a split, l not split { 58, 32, 12, 128, 128, 128, 128 }, // l split, a not split { 10, 7, 6, 128, 128, 128, 128 }, // a/l both split +#if CONFIG_EXT_PARTITION + // 128x128 -> 64x64 + { 222, 34, 30, 128, 128, 128, 128 }, // a/l both not split + { 72, 16, 44, 128, 128, 128, 128 }, // a split, l not split + { 58, 32, 12, 128, 128, 128, 128 }, // l split, a not split + { 10, 7, 6, 128, 128, 128, 128 }, // a/l both split +#endif // CONFIG_EXT_PARTITION }; #else static const vpx_prob default_partition_probs[PARTITION_CONTEXTS] @@ -195,6 +202,13 @@ static const vpx_prob default_partition_probs[PARTITION_CONTEXTS] { 72, 16, 44 }, 
// a split, l not split { 58, 32, 12 }, // l split, a not split { 10, 7, 6 }, // a/l both split +#if CONFIG_EXT_PARTITION + // 128x128 -> 64x64 + { 222, 34, 30 }, // a/l both not split + { 72, 16, 44 }, // a split, l not split + { 58, 32, 12 }, // l split, a not split + { 10, 7, 6 }, // a/l both split +#endif // CONFIG_EXT_PARTITION }; #endif // CONFIG_EXT_PARTITION_TYPES @@ -256,20 +270,33 @@ static const vpx_prob default_inter_compound_mode_probs static const vpx_prob default_interintra_prob[BLOCK_SIZES] = { 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, +#if CONFIG_EXT_PARTITION + 192, 192, 192 +#endif // CONFIG_EXT_PARTITION }; static const vpx_prob default_wedge_interintra_prob[BLOCK_SIZES] = { 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, +#if CONFIG_EXT_PARTITION + 192, 192, 192 +#endif // CONFIG_EXT_PARTITION }; static const vpx_prob default_wedge_interinter_prob[BLOCK_SIZES] = { 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, +#if CONFIG_EXT_PARTITION + 192, 192, 192 +#endif // CONFIG_EXT_PARTITION }; #endif // CONFIG_EXT_INTER #if CONFIG_OBMC static const vpx_prob default_obmc_prob[BLOCK_SIZES] = { 255, 255, 255, 151, 153, 144, 178, 165, 160, 207, 195, 168, 244, +#if CONFIG_EXT_PARTITION + // TODO(debargha) What are the correct values for these? + 192, 192, 192 +#endif // CONFIG_EXT_PARTITION }; #endif // CONFIG_OBMC @@ -389,6 +416,11 @@ vp10_default_palette_y_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1] = { { 180, 113, 136, 49, 45, 114}, { 107, 70, 87, 49, 154, 156}, { 98, 105, 142, 63, 64, 152}, +#if CONFIG_EXT_PARTITION + { 98, 105, 142, 63, 64, 152}, + { 98, 105, 142, 63, 64, 152}, + { 98, 105, 142, 63, 64, 152}, +#endif // CONFIG_EXT_PARTITION }; const vpx_prob @@ -403,6 +435,11 @@ vp10_default_palette_uv_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1] = { { 67, 53, 54, 55, 66, 93}, { 120, 130, 83, 171, 75, 214}, { 72, 55, 66, 68, 79, 107}, +#if CONFIG_EXT_PARTITION + { 72, 55, 66, 68, 79, 107}, + { 72, 55, 66, 68, 79, 107}, + { 72, 55, 66, 68, 79, 107}, +#endif // CONFIG_EXT_PARTITION }; const vpx_prob @@ -418,6 +455,11 @@ vp10_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS] { 240, 180, 100, }, { 240, 180, 100, }, { 240, 180, 100, }, +#if CONFIG_EXT_PARTITION + { 240, 180, 100, }, + { 240, 180, 100, }, + { 240, 180, 100, }, +#endif // CONFIG_EXT_PARTITION }; diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h index 3d5fe9e47..8219dc5e0 100644 --- a/vp10/common/entropymode.h +++ b/vp10/common/entropymode.h @@ -32,7 +32,7 @@ extern "C" { #define PALETTE_COLOR_CONTEXTS 16 #define PALETTE_MAX_SIZE 8 -#define PALETTE_BLOCK_SIZES (BLOCK_64X64 - BLOCK_8X8 + 1) +#define PALETTE_BLOCK_SIZES (BLOCK_LARGEST - BLOCK_8X8 + 1) #define PALETTE_Y_MODE_CONTEXTS 3 struct VP10Common; diff --git a/vp10/common/enums.h b/vp10/common/enums.h index 36c9f9121..5615cee93 100644 --- a/vp10/common/enums.h +++ b/vp10/common/enums.h @@ -18,13 +18,25 @@ extern "C" { #endif -#define MI_SIZE_LOG2 3 -#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6 +#undef MAX_SB_SIZE + +#if CONFIG_EXT_PARTITION +# define MAX_SB_SIZE_LOG2 7 +#else +# define MAX_SB_SIZE_LOG2 6 +#endif // CONFIG_EXT_PARTITION + +#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2) +#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE) +#define MI_SIZE_LOG2 3 #define MI_SIZE (1 << MI_SIZE_LOG2) // pixels per mi-unit + +#define MI_BLOCK_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2) #define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2) // mi-units per max 
block
 #define MI_MASK (MI_BLOCK_SIZE - 1)
+#define MI_MASK_2 (MI_BLOCK_SIZE * 2 - 1)
 
 #if CONFIG_EXT_TILE
 # define MAX_TILE_ROWS 1024
@@ -49,32 +61,29 @@ typedef enum BITSTREAM_PROFILE {
   MAX_PROFILES
 } BITSTREAM_PROFILE;
 
-#define BLOCK_4X4 0
-#define BLOCK_4X8 1
-#define BLOCK_8X4 2
-#define BLOCK_8X8 3
-#define BLOCK_8X16 4
-#define BLOCK_16X8 5
-#define BLOCK_16X16 6
-#define BLOCK_16X32 7
-#define BLOCK_32X16 8
-#define BLOCK_32X32 9
-#define BLOCK_32X64 10
-#define BLOCK_64X32 11
-#define BLOCK_64X64 12
-
-#if CONFIG_EXT_PARTITION
-#define BLOCK_64X128 13
-#define BLOCK_128X64 14
-#define BLOCK_128X128 15
-#define BLOCK_SIZES 16
+#define BLOCK_4X4 0
+#define BLOCK_4X8 1
+#define BLOCK_8X4 2
+#define BLOCK_8X8 3
+#define BLOCK_8X16 4
+#define BLOCK_16X8 5
+#define BLOCK_16X16 6
+#define BLOCK_16X32 7
+#define BLOCK_32X16 8
+#define BLOCK_32X32 9
+#define BLOCK_32X64 10
+#define BLOCK_64X32 11
+#define BLOCK_64X64 12
+#if !CONFIG_EXT_PARTITION
+# define BLOCK_SIZES 13
 #else
-#define BLOCK_SIZES 13
-#endif // CONFIG_EXT_PARTITION
-
-#define BLOCK_INVALID (BLOCK_SIZES)
+# define BLOCK_64X128 13
+# define BLOCK_128X64 14
+# define BLOCK_128X128 15
+# define BLOCK_SIZES 16
+#endif // !CONFIG_EXT_PARTITION
+#define BLOCK_INVALID BLOCK_SIZES
 #define BLOCK_LARGEST (BLOCK_SIZES - 1)
-
 typedef uint8_t BLOCK_SIZE;
 
 #if CONFIG_EXT_PARTITION_TYPES
@@ -104,7 +113,11 @@ typedef enum PARTITION_TYPE {
 typedef char PARTITION_CONTEXT;
 
 #define PARTITION_PLOFFSET 4  // number of probability models per block size
-#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+#if CONFIG_EXT_PARTITION
+# define PARTITION_CONTEXTS (5 * PARTITION_PLOFFSET)
+#else
+# define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+#endif // CONFIG_EXT_PARTITION
 
 // block transform size
 typedef uint8_t TX_SIZE;
@@ -114,6 +127,15 @@ typedef uint8_t TX_SIZE;
 #define TX_32X32 ((TX_SIZE)3)  // 32x32 transform
 #define TX_SIZES ((TX_SIZE)4)
 
+#define MAX_TX_SIZE_LOG2 5
+#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
+#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE)
+
+// Number of maximum size transform blocks in the maximum size superblock
+#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 \
+  ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
+#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
+
 // frame transform mode
 typedef enum {
   ONLY_4X4 = 0,  // only 4x4 transform used
@@ -286,10 +308,15 @@ typedef enum {
 /* Segment Feature Masks */
 #define MAX_MV_REF_CANDIDATES 2
+
 #if CONFIG_REF_MV
 #define MAX_REF_MV_STACK_SIZE 16
-#define REF_CAT_LEVEL 160
-#endif
+#if CONFIG_EXT_PARTITION
+#define REF_CAT_LEVEL 640
+#else
+#define REF_CAT_LEVEL 160
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_REF_MV
 
 #define INTRA_INTER_CONTEXTS 4
 #define COMP_INTER_CONTEXTS 5
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 25941d02b..fe9b13cb4 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -871,6 +871,9 @@ void vp10_setup_mask(VP10_COMMON *const cm, const int mi_row, const int mi_col,
                        cm->mi_rows - mi_row : MI_BLOCK_SIZE);
   const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
cm->mi_cols - mi_col : MI_BLOCK_SIZE); +#if CONFIG_EXT_PARTITION + assert(0 && "Not yet updated"); +#endif // CONFIG_EXT_PARTITION vp10_zero(*lfm); assert(mip[0] != NULL); @@ -1045,8 +1048,10 @@ void vp10_setup_mask(VP10_COMMON *const cm, const int mi_row, const int mi_col, const uint64_t rows = cm->mi_rows - mi_row; // Each pixel inside the border gets a 1, - const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1); - const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1); + const uint64_t mask_y = + (((uint64_t) 1 << (rows << MI_BLOCK_SIZE_LOG2)) - 1); + const uint16_t mask_uv = + (((uint16_t) 1 << (((rows + 1) >> 1) << (MI_BLOCK_SIZE_LOG2 - 1))) - 1); // Remove values completely outside our border. for (i = 0; i < TX_32X32; i++) { @@ -1262,7 +1267,7 @@ void vp10_filter_block_plane_non420(VP10_COMMON *cm, int tx_size_mask = 0; // Filter level can vary per MI - if (!(lfl[(r << 3) + (c >> ss_x)] = + if (!(lfl[(r << MI_BLOCK_SIZE_LOG2) + (c >> ss_x)] = get_filter_level(&cm->lf_info, mbmi))) continue; @@ -1280,11 +1285,13 @@ void vp10_filter_block_plane_non420(VP10_COMMON *cm, sb_type, ss_x, ss_y) : mbmi->inter_tx_size[blk_row][blk_col]; - tx_size_r = VPXMIN(tx_size, cm->above_txfm_context[mi_col + c]); - tx_size_c = VPXMIN(tx_size, cm->left_txfm_context[(mi_row + r) & 0x07]); + tx_size_r = VPXMIN(tx_size, + cm->above_txfm_context[mi_col + c]); + tx_size_c = VPXMIN(tx_size, + cm->left_txfm_context[(mi_row + r) & MI_MASK]); cm->above_txfm_context[mi_col + c] = tx_size; - cm->left_txfm_context[(mi_row + r) & 0x07] = tx_size; + cm->left_txfm_context[(mi_row + r) & MI_MASK] = tx_size; #endif // Build masks based on the transform size of each block @@ -1351,21 +1358,22 @@ void vp10_filter_block_plane_non420(VP10_COMMON *cm, border_mask = ~(mi_col == 0); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { - highbd_filter_selectively_vert(CONVERT_TO_SHORTPTR(dst->buf), - dst->stride, - mask_16x16_c & border_mask, - mask_8x8_c & border_mask, - mask_4x4_c & border_mask, - mask_4x4_int[r], - &cm->lf_info, &lfl[r << 3], - (int)cm->bit_depth); + highbd_filter_selectively_vert( + CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, + mask_16x16_c & border_mask, + mask_8x8_c & border_mask, + mask_4x4_c & border_mask, + mask_4x4_int[r], + &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2], + (int)cm->bit_depth); } else { filter_selectively_vert(dst->buf, dst->stride, mask_16x16_c & border_mask, mask_8x8_c & border_mask, mask_4x4_c & border_mask, mask_4x4_int[r], - &cm->lf_info, &lfl[r << 3]); + &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]); } #else filter_selectively_vert(dst->buf, dst->stride, @@ -1373,7 +1381,7 @@ void vp10_filter_block_plane_non420(VP10_COMMON *cm, mask_8x8_c & border_mask, mask_4x4_c & border_mask, mask_4x4_int[r], - &cm->lf_info, &lfl[r << 3]); + &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]); #endif // CONFIG_VP9_HIGHBITDEPTH dst->buf += 8 * dst->stride; mi_8x8 += row_step_stride; @@ -1400,21 +1408,22 @@ void vp10_filter_block_plane_non420(VP10_COMMON *cm, } #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { - highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), - dst->stride, - mask_16x16_r, - mask_8x8_r, - mask_4x4_r, - mask_4x4_int_r, - &cm->lf_info, &lfl[r << 3], - (int)cm->bit_depth); + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, + mask_16x16_r, + mask_8x8_r, + mask_4x4_r, + mask_4x4_int_r, + &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2], + (int)cm->bit_depth); } else { filter_selectively_horiz(dst->buf, dst->stride, 
mask_16x16_r, mask_8x8_r, mask_4x4_r, mask_4x4_int_r, - &cm->lf_info, &lfl[r << 3]); + &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]); } #else filter_selectively_horiz(dst->buf, dst->stride, @@ -1422,7 +1431,7 @@ void vp10_filter_block_plane_non420(VP10_COMMON *cm, mask_8x8_r, mask_4x4_r, mask_4x4_int_r, - &cm->lf_info, &lfl[r << 3]); + &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]); #endif // CONFIG_VP9_HIGHBITDEPTH dst->buf += 8 * dst->stride; } @@ -1455,16 +1464,18 @@ void vp10_filter_block_plane_ss00(VP10_COMMON *const cm, highbd_filter_selectively_vert_row2( plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info, - &lfm->lfl_y[r << 3], (int)cm->bit_depth); + &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2], (int)cm->bit_depth); } else { filter_selectively_vert_row2( plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l, - mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]); + mask_4x4_l, mask_4x4_int_l, &cm->lf_info, + &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2]); } #else filter_selectively_vert_row2( plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l, - mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]); + mask_4x4_l, mask_4x4_int_l, &cm->lf_info, + &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2]); #endif // CONFIG_VP9_HIGHBITDEPTH dst->buf += 16 * dst->stride; mask_16x16 >>= 16; @@ -1499,17 +1510,18 @@ void vp10_filter_block_plane_ss00(VP10_COMMON *const cm, if (cm->use_highbitdepth) { highbd_filter_selectively_horiz( CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3], + mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, + &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2], (int)cm->bit_depth); } else { filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, - &lfm->lfl_y[r << 3]); + &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2]); } #else filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, - &lfm->lfl_y[r << 3]); + &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2]); #endif // CONFIG_VP9_HIGHBITDEPTH dst->buf += 8 * dst->stride; @@ -1539,8 +1551,10 @@ void vp10_filter_block_plane_ss11(VP10_COMMON *const cm, for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) { if (plane->plane_type == 1) { for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) { - lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)]; - lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)]; + lfm->lfl_uv[(r << 1) + c] = + lfm->lfl_y[(r << MI_BLOCK_SIZE_LOG2) + (c << 1)]; + lfm->lfl_uv[((r + 2) << 1) + c] = + lfm->lfl_y[((r + 2) << MI_BLOCK_SIZE_LOG2) + (c << 1)]; } } @@ -1632,9 +1646,31 @@ void vp10_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, VP10_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only) { +#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES + const int num_planes = y_only ? 
1 : MAX_MB_PLANE; + int mi_row, mi_col; + +# if CONFIG_VAR_TX + memset(cm->above_txfm_context, TX_SIZES, cm->mi_cols); +# endif // CONFIG_VAR_TX + for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; +# if CONFIG_VAR_TX + memset(cm->left_txfm_context, TX_SIZES, MI_BLOCK_SIZE); +# endif // CONFIG_VAR_TX + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + int plane; + + vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); + + for (plane = 0; plane < num_planes; ++plane) + vp10_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); + } + } +#else const int num_planes = y_only ? 1 : MAX_MB_PLANE; int mi_row, mi_col; -#if !CONFIG_VAR_TX && !CONFIG_EXT_PARTITION_TYPES enum lf_path path; LOOP_FILTER_MASK lfm; @@ -1646,29 +1682,17 @@ void vp10_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, path = LF_PATH_444; else path = LF_PATH_SLOW; -#endif // !CONFIG_VAR_TX && !CONFIG_EXT_PARTITION_TYPES -#if CONFIG_VAR_TX - memset(cm->above_txfm_context, TX_SIZES, cm->mi_cols); -#endif for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; -#if CONFIG_VAR_TX - memset(cm->left_txfm_context, TX_SIZES, 8); -#endif for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { int plane; vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); -#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION_TYPES - for (plane = 0; plane < num_planes; ++plane) - vp10_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, - mi_row, mi_col); -#else // TODO(JBB): Make setup_mask work for non 420. - vp10_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, - &lfm); + vp10_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); + vp10_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm); for (plane = 1; plane < num_planes; ++plane) { switch (path) { @@ -1684,9 +1708,9 @@ void vp10_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, break; } } -#endif // CONFIG_VAR_TX || CONFIG_EXT_PARTITION_TYPES } } +#endif // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES } void vp10_loop_filter_frame(YV12_BUFFER_CONFIG *frame, diff --git a/vp10/common/loopfilter.h b/vp10/common/loopfilter.h index 81f44de7c..8fa0b8048 100644 --- a/vp10/common/loopfilter.h +++ b/vp10/common/loopfilter.h @@ -84,8 +84,8 @@ typedef struct { uint16_t above_uv[TX_SIZES]; uint16_t left_int_4x4_uv; uint16_t above_int_4x4_uv; - uint8_t lfl_y[64]; - uint8_t lfl_uv[16]; + uint8_t lfl_y[MI_BLOCK_SIZE * MI_BLOCK_SIZE]; + uint8_t lfl_uv[MI_BLOCK_SIZE / 2 * MI_BLOCK_SIZE / 2]; } LOOP_FILTER_MASK; /* assorted loopfilter functions which get used elsewhere */ diff --git a/vp10/common/mvref_common.c b/vp10/common/mvref_common.c index 30d779051..aa651a2e2 100644 --- a/vp10/common/mvref_common.c +++ b/vp10/common/mvref_common.c @@ -12,6 +12,7 @@ #include "vp10/common/mvref_common.h" #if CONFIG_REF_MV + static uint8_t add_ref_mv_candidate(const MODE_INFO *const candidate_mi, const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2], @@ -23,6 +24,8 @@ static uint8_t add_ref_mv_candidate(const MODE_INFO *const candidate_mi, int index = 0, ref; int newmv_count = 0; + assert(2 * weight < REF_CAT_LEVEL); + if (rf[1] == NONE) { // single reference frame for (ref = 0; ref < 2; ++ref) { @@ -246,32 +249,30 @@ static uint8_t scan_blk_mbmi(const VP10_COMMON *cm, const MACROBLOCKD *xd, return newmv_count; } -// This function assumes MI blocks are 8x8 and 
coding units are 64x64 static int has_top_right(const MACROBLOCKD *xd, int mi_row, int mi_col, int bs) { // In a split partition all apart from the bottom right has a top right - int has_tr = !((mi_row & bs) & (bs * 2 - 1)) || - !((mi_col & bs) & (bs * 2 - 1)); + int has_tr = !((mi_row & bs) && (mi_col & bs)); - // Filter out partial right-most boundaries - // For each 4x4 group of blocks, when the bottom right is decoded the blocks - // to the right have not been decoded therefore the second from bottom in the - // right-most column does not have a top right - if ((mi_col & bs) & (bs * 2 - 1)) { - if (((mi_col & (2 * bs)) & (bs * 4 - 1)) && - ((mi_row & (2 * bs)) & (bs * 4 - 1))) - has_tr = 0; - } + // bs > 0 and bs is a power of 2 + assert(bs > 0 && !(bs & (bs - 1))); - // If the right had side of the block lines up with the right had edge end of - // a group of 8x8 MI blocks (i.e. edge of a coding unit) and is not on the top - // row of that coding unit, it does not have a top right - if (has_tr) - if (((mi_col + xd->n8_w) & 0x07) == 0) - if ((mi_row & 0x07) > 0) + // For each 4x4 group of blocks, when the bottom right is decoded the blocks + // to the right have not been decoded therefore the bottom right does + // not have a top right + while (bs < MI_BLOCK_SIZE) { + if (mi_col & bs) { + if ((mi_col & (2 * bs)) && (mi_row & (2 * bs))) { has_tr = 0; + break; + } + } else { + break; + } + bs <<= 1; + } - // The left had of two vertical rectangles always has a top right (as the + // The left hand of two vertical rectangles always has a top right (as the // block above will have been decoded) if (xd->n8_w < xd->n8_h) if (!xd->is_sec_rect) @@ -359,8 +360,11 @@ static void setup_ref_mv_list(const VP10_COMMON *cm, const MACROBLOCKD *xd, nearest_refmv_count = *refmv_count; - for (idx = 0; idx < nearest_refmv_count; ++idx) + for (idx = 0; idx < nearest_refmv_count; ++idx) { + assert(ref_mv_stack[idx].weight > 0 && + ref_mv_stack[idx].weight < REF_CAT_LEVEL); ref_mv_stack[idx].weight += REF_CAT_LEVEL; + } if (prev_frame_mvs_base && cm->show_frame && cm->last_show_frame && rf[1] == NONE) { diff --git a/vp10/common/mvref_common.h b/vp10/common/mvref_common.h index 104a91a99..5a3b6a88d 100644 --- a/vp10/common/mvref_common.h +++ b/vp10/common/mvref_common.h @@ -120,7 +120,16 @@ static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { // 64X32 {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, // 64X64 - {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}} + {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}, +#if CONFIG_EXT_PARTITION + // TODO(debargha/jingning) Making them twice the 32x64, .. 
ones above + // 64x128 + {{0, -2}, {-2, 0}, {8, -2}, {-2, 4}, {-2, -2}, {0, -6}, {-6, 0}, {4, -2}}, + // 128x64 + {{-2, 0}, {0, -2}, {-2, 8}, {4, -2}, {-2, -2}, {-6, 0}, {0, -6}, {-2, 4}}, + // 128x128 + {{-2, 6}, {6, -2}, {-2, 8}, {8, -2}, {-2, -2}, {-2, 0}, {0, -2}, {-2, 12}}, +#endif // CONFIG_EXT_PARTITION }; static const int idx_n_column_to_subblock[4][2] = { @@ -131,7 +140,11 @@ static const int idx_n_column_to_subblock[4][2] = { }; // clamp_mv_ref -#define MV_BORDER (8 << 3) // Allow 8 pels in 1/8th pel units +#if CONFIG_EXT_PARTITION +# define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units +#else +# define MV_BORDER (8 << 3) // Allow 8 pels in 1/8th pel units +#endif // CONFIG_EXT_PARTITION static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER, diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h index 3eac586f4..bdd9ffeaf 100644 --- a/vp10/common/onyxc_int.h +++ b/vp10/common/onyxc_int.h @@ -332,7 +332,7 @@ typedef struct VP10Common { ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; #if CONFIG_VAR_TX TXFM_CONTEXT *above_txfm_context; - TXFM_CONTEXT left_txfm_context[8]; + TXFM_CONTEXT left_txfm_context[MI_BLOCK_SIZE]; #endif int above_context_alloc_cols; @@ -440,7 +440,7 @@ static INLINE void vp10_init_macroblockd(VP10_COMMON *cm, MACROBLOCKD *xd, static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { const int above_idx = mi_col * 2; - const int left_idx = (mi_row * 2) & 15; // FIXME: Mask should be CU_SIZE*2-1 + const int left_idx = (mi_row * 2) & MI_MASK_2; int i; for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblockd_plane *const pd = &xd->plane[i]; diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c index 517538915..57c26a0fd 100644 --- a/vp10/common/reconinter.c +++ b/vp10/common/reconinter.c @@ -454,52 +454,52 @@ void vp10_make_masked_inter_predictor( const MACROBLOCKD *xd) { const MODE_INFO *mi = xd->mi[0]; #if CONFIG_VP9_HIGHBITDEPTH - uint8_t tmp_dst_[2 * CU_SIZE * CU_SIZE]; + uint8_t tmp_dst_[2 * MAX_SB_SQUARE]; uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
CONVERT_TO_BYTEPTR(tmp_dst_) : tmp_dst_; - vp10_make_inter_predictor(pre, pre_stride, tmp_dst, CU_SIZE, + vp10_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x, subpel_y, sf, w, h, 0, interp_filter, xs, ys, xd); #if CONFIG_SUPERTX if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) build_masked_compound_extend_highbd( - dst, dst_stride, tmp_dst, CU_SIZE, plane, + dst, dst_stride, tmp_dst, MAX_SB_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_y, wedge_offset_x, h, w); else build_masked_compound_extend( - dst, dst_stride, tmp_dst, CU_SIZE, plane, + dst, dst_stride, tmp_dst, MAX_SB_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_y, wedge_offset_x, h, w); #else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) build_masked_compound_highbd( - dst, dst_stride, tmp_dst, CU_SIZE, + dst, dst_stride, tmp_dst, MAX_SB_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); else build_masked_compound( - dst, dst_stride, tmp_dst, CU_SIZE, + dst, dst_stride, tmp_dst, MAX_SB_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); #endif // CONFIG_SUPERTX #else // CONFIG_VP9_HIGHBITDEPTH - uint8_t tmp_dst[CU_SIZE * CU_SIZE]; - vp10_make_inter_predictor(pre, pre_stride, tmp_dst, CU_SIZE, + uint8_t tmp_dst[MAX_SB_SQUARE]; + vp10_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x, subpel_y, sf, w, h, 0, interp_filter, xs, ys, xd); #if CONFIG_SUPERTX build_masked_compound_extend( - dst, dst_stride, tmp_dst, CU_SIZE, plane, + dst, dst_stride, tmp_dst, MAX_SB_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_y, wedge_offset_x, h, w); #else build_masked_compound( - dst, dst_stride, tmp_dst, CU_SIZE, + dst, dst_stride, tmp_dst, MAX_SB_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); #endif // CONFIG_SUPERTX @@ -877,12 +877,13 @@ void vp10_build_masked_inter_predictor_complex( int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, PARTITION_TYPE partition, int plane) { int i, j; - uint8_t mask[MAXTXLEN]; - int top_w = 4 << b_width_log2_lookup[top_bsize], - top_h = 4 << b_height_log2_lookup[top_bsize]; - int w = 4 << b_width_log2_lookup[bsize], h = 4 << b_height_log2_lookup[bsize]; - int w_offset = (mi_col - mi_col_ori) << 3, - h_offset = (mi_row - mi_row_ori) << 3; + uint8_t mask[MAX_TX_SIZE]; + int top_w = 4 << b_width_log2_lookup[top_bsize]; + int top_h = 4 << b_height_log2_lookup[top_bsize]; + int w = 4 << b_width_log2_lookup[bsize]; + int h = 4 << b_height_log2_lookup[bsize]; + int w_offset = (mi_col - mi_col_ori) * MI_SIZE; + int h_offset = (mi_row - mi_row_ori) * MI_SIZE; #if CONFIG_VP9_HIGHBITDEPTH uint16_t *dst16= CONVERT_TO_SHORTPTR(dst); @@ -890,6 +891,8 @@ void vp10_build_masked_inter_predictor_complex( int b_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0; #endif // CONFIG_VP9_HIGHBITDEPTH + assert(bsize <= BLOCK_32X32); + top_w >>= pd->subsampling_x; top_h >>= pd->subsampling_y; w >>= pd->subsampling_x; @@ -916,7 +919,8 @@ void vp10_build_masked_inter_predictor_complex( if (m == 0) dst_tmp[j] = dst2_tmp[j]; else - dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6; + dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m + + dst2_tmp[j] * (64 - m), 6); } dst_tmp += dst_stride; dst2_tmp += dst2_stride; @@ -943,7 +947,8 @@ void vp10_build_masked_inter_predictor_complex( if (m == 0) dst_tmp[j] = dst2_tmp[j]; else - dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6; + dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m + + dst2_tmp[j] * (64 - m), 6); } dst_tmp += dst_stride; dst2_tmp += dst2_stride; @@ -978,7 +983,8 @@ void vp10_build_masked_inter_predictor_complex( if (m == 0) dst_tmp[j] = dst2_tmp[j]; else - dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6; + dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m + + dst2_tmp[j] * (64 - m), 6); } memcpy(dst_tmp + j, dst2_tmp + j, (top_w - w_offset - w) * sizeof(uint16_t)); @@ -1001,7 +1007,8 @@ void vp10_build_masked_inter_predictor_complex( if (m == 0) dst_tmp[j] = dst2_tmp[j]; else - dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6; + dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m + + dst2_tmp[j] * (64 - m), 6); } memcpy(dst_tmp + j, dst2_tmp + j, (top_w - w_offset - w) * sizeof(uint8_t)); @@ -1158,11 +1165,38 @@ static const uint8_t obmc_mask_16[2][16] = { }; static const uint8_t obmc_mask_32[2][32] = { - { 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, - 56, 57, 58, 59, 60, 60, 61, 62, 62, 63, 63, 64, 64, 64, 64, 64}, - { 31, 29, 28, 26, 24, 23, 21, 20, 19, 17, 16, 14, 13, 12, 11, 9, - 8, 7, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0, 0, 0, 0} + { 33, 35, 36, 38, 40, 41, 43, 44, + 45, 47, 48, 50, 51, 52, 53, 55, + 56, 57, 58, 59, 60, 60, 61, 62, + 62, 63, 63, 64, 64, 64, 64, 64 }, + { 31, 29, 28, 26, 24, 23, 21, 20, + 19, 17, 16, 14, 13, 12, 11, 9, + 8, 7, 6, 5, 4, 4, 3, 2, + 2, 1, 1, 0, 0, 0, 0, 0 } +}; + +#if CONFIG_EXT_PARTITION +// TODO(debargha): What are the correct values here? 
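The blending hunks above replace the open-coded (x * m + y * (64 - m) + 32) >> 6 with ROUND_POWER_OF_TWO. Assuming the usual libvpx definition of that macro (add half the divisor, then shift), the rewrite is bit-exact with the old form; a minimal standalone check:

#include <assert.h>

/* Same shape as libvpx's ROUND_POWER_OF_TWO: add half the divisor, then
   shift (restated here only for illustration). */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  int x, y, m;
  for (x = 0; x < 256; x += 3)
    for (y = 0; y < 256; y += 7)
      for (m = 0; m <= 64; ++m)  /* 6-bit blend weights, as in the patch */
        assert(ROUND_POWER_OF_TWO(x * m + y * (64 - m), 6) ==
               ((x * m + y * (64 - m) + 32) >> 6));
  return 0;
}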
+static const uint8_t obmc_mask_64[2][64] = { + { 33, 33, 35, 35, 36, 36, 38, 38, + 40, 40, 41, 41, 43, 43, 44, 44, + 45, 45, 47, 47, 48, 48, 50, 50, + 51, 51, 52, 52, 53, 53, 55, 55, + 56, 56, 57, 57, 58, 58, 59, 59, + 60, 60, 60, 60, 61, 61, 62, 62, + 62, 62, 63, 63, 63, 63, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64 }, + { 31, 31, 29, 29, 28, 28, 26, 26, + 24, 24, 23, 23, 21, 21, 20, 20, + 19, 19, 17, 17, 16, 16, 14, 14, + 13, 13, 12, 12, 11, 11, 9, 9, + 8, 8, 7, 7, 6, 6, 5, 5, + 4, 4, 4, 4, 3, 3, 2, 2, + 2, 2, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 } }; +#endif // CONFIG_EXT_PARTITION + void setup_obmc_mask(int length, const uint8_t *mask[2]) { switch (length) { @@ -1190,9 +1224,15 @@ void setup_obmc_mask(int length, const uint8_t *mask[2]) { mask[0] = obmc_mask_32[0]; mask[1] = obmc_mask_32[1]; break; +#if CONFIG_EXT_PARTITION + case 64: + mask[0] = obmc_mask_64[0]; + mask[1] = obmc_mask_64[1]; + break; +#endif // CONFIG_EXT_PARTITION default: - mask[0] = obmc_mask_32[0]; - mask[1] = obmc_mask_32[1]; + mask[0] = NULL; + mask[1] = NULL; assert(0); break; } @@ -1265,15 +1305,15 @@ void vp10_build_obmc_inter_prediction(VP10_COMMON *cm, for (plane = 0; plane < MAX_MB_PLANE; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; - int bw = (mi_step * 8) >> pd->subsampling_x; + int bw = (mi_step * MI_SIZE) >> pd->subsampling_x; int bh = overlap >> pd->subsampling_y; int row, col; int dst_stride = use_tmp_dst_buf ? final_stride[plane] : pd->dst.stride; uint8_t *dst = use_tmp_dst_buf ? - &final_buf[plane][(i * 8) >> pd->subsampling_x] : - &pd->dst.buf[(i * 8) >> pd->subsampling_x]; + &final_buf[plane][(i * MI_SIZE) >> pd->subsampling_x] : + &pd->dst.buf[(i * MI_SIZE) >> pd->subsampling_x]; int tmp_stride = tmp_stride1[plane]; - uint8_t *tmp = &tmp_buf1[plane][(i * 8) >> pd->subsampling_x]; + uint8_t *tmp = &tmp_buf1[plane][(i * MI_SIZE) >> pd->subsampling_x]; const uint8_t *mask[2]; setup_obmc_mask(bh, mask); @@ -1285,8 +1325,9 @@ void vp10_build_obmc_inter_prediction(VP10_COMMON *cm, for (row = 0; row < bh; ++row) { for (col = 0; col < bw; ++col) - dst16[col] = (mask[0][row] * dst16[col] + mask[1][row] * tmp16[col] - + 32) >> 6; + dst16[col] = ROUND_POWER_OF_TWO(mask[0][row] * dst16[col] + + mask[1][row] * tmp16[col], 6); + dst16 += dst_stride; tmp16 += tmp_stride; } @@ -1294,8 +1335,8 @@ void vp10_build_obmc_inter_prediction(VP10_COMMON *cm, #endif // CONFIG_VP9_HIGHBITDEPTH for (row = 0; row < bh; ++row) { for (col = 0; col < bw; ++col) - dst[col] = (mask[0][row] * dst[col] + mask[1][row] * tmp[col] + 32) - >> 6; + dst[col] = ROUND_POWER_OF_TWO(mask[0][row] * dst[col] + + mask[1][row] * tmp[col], 6); dst += dst_stride; tmp += tmp_stride; } @@ -1332,15 +1373,15 @@ void vp10_build_obmc_inter_prediction(VP10_COMMON *cm, for (plane = 0; plane < MAX_MB_PLANE; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; int bw = overlap >> pd->subsampling_x; - int bh = (mi_step * 8) >> pd->subsampling_y; + int bh = (mi_step * MI_SIZE) >> pd->subsampling_y; int row, col; int dst_stride = use_tmp_dst_buf ? final_stride[plane] : pd->dst.stride; uint8_t *dst = use_tmp_dst_buf ? 
- &final_buf[plane][(i * 8 * dst_stride) >> pd->subsampling_y] : - &pd->dst.buf[(i * 8 * dst_stride) >> pd->subsampling_y]; + &final_buf[plane][(i * MI_SIZE * dst_stride) >> pd->subsampling_y] : + &pd->dst.buf[(i * MI_SIZE * dst_stride) >> pd->subsampling_y]; int tmp_stride = tmp_stride2[plane]; uint8_t *tmp = &tmp_buf2[plane] - [(i * 8 * tmp_stride) >> pd->subsampling_y]; + [(i * MI_SIZE * tmp_stride) >> pd->subsampling_y]; const uint8_t *mask[2]; setup_obmc_mask(bw, mask); @@ -1352,8 +1393,8 @@ void vp10_build_obmc_inter_prediction(VP10_COMMON *cm, for (row = 0; row < bh; ++row) { for (col = 0; col < bw; ++col) - dst16[col] = (mask[0][col] * dst16[col] + mask[1][col] * tmp16[col] - + 32) >> 6; + dst16[col] = ROUND_POWER_OF_TWO(mask[0][col] * dst16[col] + + mask[1][col] * tmp16[col], 6); dst16 += dst_stride; tmp16 += tmp_stride; } @@ -1361,8 +1402,8 @@ void vp10_build_obmc_inter_prediction(VP10_COMMON *cm, #endif // CONFIG_VP9_HIGHBITDEPTH for (row = 0; row < bh; ++row) { for (col = 0; col < bw; ++col) - dst[col] = (mask[0][col] * dst[col] + mask[1][col] * tmp[col] + 32) - >> 6; + dst[col] = ROUND_POWER_OF_TWO(mask[0][col] * dst[col] + + mask[1][col] * tmp[col], 6); dst += dst_stride; tmp += tmp_stride; } @@ -1572,7 +1613,31 @@ static void combine_interintra(PREDICTION_MODE mode, static const int scale_bits = 8; static const int scale_max = 256; static const int scale_round = 127; - static const int weights1d[64] = { +#if CONFIG_EXT_PARTITION + // TODO(debargha): Fill in the correct weights for 128 wide blocks. + static const int weights1d[MAX_SB_SIZE] = { + 128, 128, 125, 125, 122, 122, 119, 119, + 116, 116, 114, 114, 111, 111, 109, 109, + 107, 107, 105, 105, 103, 103, 101, 101, + 99, 99, 97, 97, 96, 96, 94, 94, + 93, 93, 91, 91, 90, 90, 89, 89, + 88, 88, 86, 86, 85, 85, 84, 84, + 83, 83, 82, 82, 81, 81, 81, 81, + 80, 80, 79, 79, 78, 78, 78, 78, + 77, 77, 76, 76, 76, 76, 75, 75, + 75, 75, 74, 74, 74, 74, 73, 73, + 73, 73, 72, 72, 72, 72, 71, 71, + 71, 71, 71, 71, 70, 70, 70, 70, + 70, 70, 70, 70, 69, 69, 69, 69, + 69, 69, 69, 69, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + }; + static int size_scales[BLOCK_SIZES] = { + 32, 16, 16, 16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1 + }; +#else + static const int weights1d[MAX_SB_SIZE] = { 128, 125, 122, 119, 116, 114, 111, 109, 107, 105, 103, 101, 99, 97, 96, 94, 93, 91, 90, 89, 88, 86, 85, 84, @@ -1582,14 +1647,14 @@ static void combine_interintra(PREDICTION_MODE mode, 70, 70, 69, 69, 69, 69, 68, 68, 68, 68, 68, 67, 67, 67, 67, 67, }; - const int bw = 4 << b_width_log2_lookup[plane_bsize]; - const int bh = 4 << b_height_log2_lookup[plane_bsize]; - - int size = VPXMAX(bw, bh); - int size_scale = (size >= 64 ? 1 : - size == 32 ? 2 : - size == 16 ? 4 : - size == 8 ? 8 : 16); + static int size_scales[BLOCK_SIZES] = { + 16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1 + }; +#endif // CONFIG_EXT_PARTITION + + const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize]; + const int size_scale = size_scales[plane_bsize]; int i, j; if (use_wedge_interintra && get_wedge_bits(bsize)) { @@ -1712,7 +1777,31 @@ static void combine_interintra_highbd(PREDICTION_MODE mode, static const int scale_bits = 8; static const int scale_max = 256; static const int scale_round = 127; - static const int weights1d[64] = { +#if CONFIG_EXT_PARTITION + // TODO(debargha): Fill in the correct weights for 128 wide blocks. 
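As with the obmc_mask_64 placeholder earlier, the provisional 128-entry interintra ramp (here and in the non-highbd hunk above) is the existing 64-entry table with each entry repeated once, and size_scales is simply MAX_SB_SIZE divided by the larger block dimension. A sketch of the upsampling (helper name hypothetical):

/* Illustration only: build the doubled-length placeholder table by sample
   repetition; reproduces the values below. */
static void upsample_ramp_2x(const int *src, int *dst, int n) {
  int i;
  for (i = 0; i < n; ++i)
    dst[2 * i] = dst[2 * i + 1] = src[i];
}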
+ static const int weights1d[MAX_SB_SIZE] = { + 128, 128, 125, 125, 122, 122, 119, 119, + 116, 116, 114, 114, 111, 111, 109, 109, + 107, 107, 105, 105, 103, 103, 101, 101, + 99, 99, 97, 97, 96, 96, 94, 94, + 93, 93, 91, 91, 90, 90, 89, 89, + 88, 88, 86, 86, 85, 85, 84, 84, + 83, 83, 82, 82, 81, 81, 81, 81, + 80, 80, 79, 79, 78, 78, 78, 78, + 77, 77, 76, 76, 76, 76, 75, 75, + 75, 75, 74, 74, 74, 74, 73, 73, + 73, 73, 72, 72, 72, 72, 71, 71, + 71, 71, 71, 71, 70, 70, 70, 70, + 70, 70, 70, 70, 69, 69, 69, 69, + 69, 69, 69, 69, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + }; + static int size_scales[BLOCK_SIZES] = { + 32, 16, 16, 16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1 + }; +#else + static const int weights1d[MAX_SB_SIZE] = { 128, 125, 122, 119, 116, 114, 111, 109, 107, 105, 103, 101, 99, 97, 96, 94, 93, 91, 90, 89, 88, 86, 85, 84, @@ -1722,15 +1811,16 @@ static void combine_interintra_highbd(PREDICTION_MODE mode, 70, 70, 69, 69, 69, 69, 68, 68, 68, 68, 68, 67, 67, 67, 67, 67, }; - const int bw = 4 << b_width_log2_lookup[plane_bsize]; - const int bh = 4 << b_height_log2_lookup[plane_bsize]; - - int size = VPXMAX(bw, bh); - int size_scale = (size >= 64 ? 1 : - size == 32 ? 2 : - size == 16 ? 4 : - size == 8 ? 8 : 16); + static int size_scales[BLOCK_SIZES] = { + 16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1 + }; +#endif // CONFIG_EXT_PARTITION + + const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize]; + const int size_scale = size_scales[plane_bsize]; int i, j; + uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8); uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8); uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8); @@ -1889,8 +1979,7 @@ void vp10_build_interintra_predictors_sby(MACROBLOCKD *xd, const int bw = 4 << b_width_log2_lookup[bsize]; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED(16, uint16_t, - intrapredictor[CU_SIZE * CU_SIZE]); + DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); build_intra_predictors_for_interintra( xd, xd->plane[0].dst.buf, xd->plane[0].dst.stride, CONVERT_TO_BYTEPTR(intrapredictor), bw, @@ -1907,7 +1996,7 @@ void vp10_build_interintra_predictors_sby(MACROBLOCKD *xd, } #endif // CONFIG_VP9_HIGHBITDEPTH { - uint8_t intrapredictor[CU_SIZE * CU_SIZE]; + uint8_t intrapredictor[MAX_SB_SQUARE]; build_intra_predictors_for_interintra( xd, xd->plane[0].dst.buf, xd->plane[0].dst.stride, intrapredictor, bw, @@ -1931,8 +2020,7 @@ void vp10_build_interintra_predictors_sbc(MACROBLOCKD *xd, const int bw = 4 << b_width_log2_lookup[uvbsize]; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED(16, uint16_t, - uintrapredictor[CU_SIZE * CU_SIZE]); + DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]); build_intra_predictors_for_interintra( xd, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, CONVERT_TO_BYTEPTR(uintrapredictor), bw, @@ -1950,7 +2038,7 @@ void vp10_build_interintra_predictors_sbc(MACROBLOCKD *xd, } #endif // CONFIG_VP9_HIGHBITDEPTH { - uint8_t uintrapredictor[CU_SIZE * CU_SIZE]; + uint8_t uintrapredictor[MAX_SB_SQUARE]; build_intra_predictors_for_interintra( xd, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, uintrapredictor, bw, @@ -2117,30 +2205,30 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane, if (ref && get_wedge_bits(mi->mbmi.sb_type) && mi->mbmi.use_wedge_interinter) { #if CONFIG_VP9_HIGHBITDEPTH - 
uint8_t tmp_dst_[2 * CU_SIZE * CU_SIZE]; + uint8_t tmp_dst_[2 * MAX_SB_SQUARE]; uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? CONVERT_TO_BYTEPTR(tmp_dst_) : tmp_dst_; #else - uint8_t tmp_dst[CU_SIZE * CU_SIZE]; + uint8_t tmp_dst[MAX_SB_SQUARE]; #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int k; for (k = 0; k < h; ++k) - memcpy(tmp_dst_ + 2 * CU_SIZE * k, ext_dst1 + + memcpy(tmp_dst_ + 2 * MAX_SB_SIZE * k, ext_dst1 + ext_dst_stride1 * 2 * k, w * 2); } else { int k; for (k = 0; k < h; ++k) - memcpy(tmp_dst_ + CU_SIZE * k, ext_dst1 + + memcpy(tmp_dst_ + MAX_SB_SIZE * k, ext_dst1 + ext_dst_stride1 * k, w); } #else { int k; for (k = 0; k < h; ++k) - memcpy(tmp_dst + CU_SIZE * k, ext_dst1 + + memcpy(tmp_dst + MAX_SB_SIZE * k, ext_dst1 + ext_dst_stride1 * k, w); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -2149,20 +2237,20 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { build_masked_compound_extend_highbd( - dst, dst_buf->stride, tmp_dst, CU_SIZE, plane, + dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_y, wedge_offset_x, h, w); } else { build_masked_compound_extend( - dst, dst_buf->stride, tmp_dst, CU_SIZE, plane, + dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_y, wedge_offset_x, h, w); } #else build_masked_compound_extend(dst, dst_buf->stride, tmp_dst, - CU_SIZE, plane, + MAX_SB_SIZE, plane, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, wedge_offset_y, wedge_offset_x, h, w); @@ -2171,12 +2259,12 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) build_masked_compound_highbd(dst, dst_buf->stride, tmp_dst, - CU_SIZE, + MAX_SB_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); else #endif // CONFIG_VP9_HIGHBITDEPTH - build_masked_compound(dst, dst_buf->stride, tmp_dst, CU_SIZE, + build_masked_compound(dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE, mi->mbmi.interinter_wedge_index, mi->mbmi.sb_type, h, w); #endif // CONFIG_SUPERTX diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h index 4dcd203e7..75c371e39 100644 --- a/vp10/common/reconinter.h +++ b/vp10/common/reconinter.h @@ -443,8 +443,8 @@ void vp10_build_prediction_by_left_preds(VP10_COMMON *cm, #endif // CONFIG_OBMC #if CONFIG_EXT_INTER -#define MASK_MASTER_SIZE (2 * CU_SIZE) -#define MASK_MASTER_STRIDE (2 * CU_SIZE) +#define MASK_MASTER_SIZE (2 * MAX_SB_SIZE) +#define MASK_MASTER_STRIDE (2 * MAX_SB_SIZE) void vp10_init_wedge_masks(); diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c index 11c4b940b..300005f74 100644 --- a/vp10/common/reconintra.c +++ b/vp10/common/reconintra.c @@ -44,30 +44,30 @@ static const uint8_t extend_modes[INTRA_MODES] = { NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // TM }; -static const uint8_t orders_64x64[1] = { 0 }; -static const uint8_t orders_64x32[2] = { 0, 1 }; -static const uint8_t orders_32x64[2] = { 0, 1 }; -static const uint8_t orders_32x32[4] = { +static const uint8_t orders_128x128[1] = { 0 }; +static const uint8_t orders_128x64[2] = { 0, 1 }; +static const uint8_t orders_64x128[2] = { 0, 1 }; +static const uint8_t orders_64x64[4] = { 0, 1, 2, 3, }; -static const uint8_t orders_32x16[8] = { +static const uint8_t orders_64x32[8] = { 
0, 2, 1, 3, 4, 6, 5, 7, }; -static const uint8_t orders_16x32[8] = { +static const uint8_t orders_32x64[8] = { 0, 1, 2, 3, 4, 5, 6, 7, }; -static const uint8_t orders_16x16[16] = { +static const uint8_t orders_32x32[16] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, }; -static const uint8_t orders_16x8[32] = { +static const uint8_t orders_32x16[32] = { 0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, @@ -77,13 +77,13 @@ static const uint8_t orders_16x8[32] = { 20, 22, 28, 30, 21, 23, 29, 31, }; -static const uint8_t orders_8x16[32] = { +static const uint8_t orders_16x32[32] = { 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31, }; -static const uint8_t orders_8x8[64] = { +static const uint8_t orders_16x16[64] = { 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23, 8, 9, 12, 13, 24, 25, 28, 29, @@ -93,24 +93,96 @@ static const uint8_t orders_8x8[64] = { 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63, }; + +#if CONFIG_EXT_PARTITION +static const uint8_t orders_16x8[128] = { + 0, 2, 8, 10, 32, 34, 40, 42, + 1, 3, 9, 11, 33, 35, 41, 43, + 4, 6, 12, 14, 36, 38, 44, 46, + 5, 7, 13, 15, 37, 39, 45, 47, + 16, 18, 24, 26, 48, 50, 56, 58, + 17, 19, 25, 27, 49, 51, 57, 59, + 20, 22, 28, 30, 52, 54, 60, 62, + 21, 23, 29, 31, 53, 55, 61, 63, + 64, 66, 72, 74, 96, 98, 104, 106, + 65, 67, 73, 75, 97, 99, 105, 107, + 68, 70, 76, 78, 100, 102, 108, 110, + 69, 71, 77, 79, 101, 103, 109, 111, + 80, 82, 88, 90, 112, 114, 120, 122, + 81, 83, 89, 91, 113, 115, 121, 123, + 84, 86, 92, 94, 116, 118, 124, 126, + 85, 87, 93, 95, 117, 119, 125, 127, +}; +static const uint8_t orders_8x16[128] = { + 0, 1, 2, 3, 8, 9, 10, 11, 32, 33, 34, 35, 40, 41, 42, 43, + 4, 5, 6, 7, 12, 13, 14, 15, 36, 37, 38, 39, 44, 45, 46, 47, + 16, 17, 18, 19, 24, 25, 26, 27, 48, 49, 50, 51, 56, 57, 58, 59, + 20, 21, 22, 23, 28, 29, 30, 31, 52, 53, 54, 55, 60, 61, 62, 63, + 64, 65, 66, 67, 72, 73, 74, 75, 96, 97, 98, 99, 104, 105, 106, 107, + 68, 69, 70, 71, 76, 77, 78, 79, 100, 101, 102, 103, 108, 109, 110, 111, + 80, 81, 82, 83, 88, 89, 90, 91, 112, 113, 114, 115, 120, 121, 122, 123, + 84, 85, 86, 87, 92, 93, 94, 95, 116, 117, 118, 119, 124, 125, 126, 127, +}; +static const uint8_t orders_8x8[256] = { +0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, 85, +2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83, 86, 87, +8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88, 89, 92, 93, +10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79, 90, 91, 94, 95, +32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100, 101, 112, 113, 116, 117, +34, 35, 38, 39, 50, 51, 54, 55, 98, 99, 102, 103, 114, 115, 118, 119, +40, 41, 44, 45, 56, 57, 60, 61, 104, 105, 108, 109, 120, 121, 124, 125, +42, 43, 46, 47, 58, 59, 62, 63, 106, 107, 110, 111, 122, 123, 126, 127, +128, 129, 132, 133, 144, 145, 148, 149, 192, 193, 196, 197, 208, 209, 212, 213, +130, 131, 134, 135, 146, 147, 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, +136, 137, 140, 141, 152, 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, +138, 139, 142, 143, 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, +160, 161, 164, 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, +162, 163, 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, +168, 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253, +170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254, 255, +}; + static const uint8_t *const orders[BLOCK_SIZES] = { - orders_8x8, orders_8x8, 
orders_8x8, orders_8x8, - orders_8x16, orders_16x8, orders_16x16, - orders_16x32, orders_32x16, orders_32x32, - orders_32x64, orders_64x32, orders_64x64, + // 4X4 + orders_8x8, + // 4X8, 8X4, 8X8 + orders_8x8, orders_8x8, orders_8x8, + // 8X16, 16X8, 16X16 + orders_8x16, orders_16x8, orders_16x16, + // 16X32, 32X16, 32X32 + orders_16x32, orders_32x16, orders_32x32, + // 32X64, 64X32, 64X64 + orders_32x64, orders_64x32, orders_64x64, + // 64x128, 128x64, 128x128 + orders_64x128, orders_128x64, orders_128x128 }; +#else +static const uint8_t *const orders[BLOCK_SIZES] = { + // 4X4 + orders_16x16, + // 4X8, 8X4, 8X8 + orders_16x16, orders_16x16, orders_16x16, + // 8X16, 16X8, 16X16 + orders_16x32, orders_32x16, orders_32x32, + // 16X32, 32X16, 32X32 + orders_32x64, orders_64x32, orders_64x64, + // 32X64, 64X32, 64X64 + orders_64x128, orders_128x64, orders_128x128 +}; +#endif // CONFIG_EXT_PARTITION + #if CONFIG_EXT_PARTITION_TYPES -static const uint8_t orders_verta_32x32[4] = { +static const uint8_t orders_verta_64x64[4] = { 0, 2, 1, 2, }; -static const uint8_t orders_verta_16x16[16] = { +static const uint8_t orders_verta_32x32[16] = { 0, 2, 4, 6, 1, 2, 5, 6, 8, 10, 12, 14, 9, 10, 13, 14, }; -static const uint8_t orders_verta_8x8[64] = { +static const uint8_t orders_verta_16x16[64] = { 0, 2, 4, 6, 16, 18, 20, 22, 1, 2, 5, 6, 17, 18, 21, 22, 8, 10, 12, 14, 24, 26, 28, 30, @@ -120,12 +192,53 @@ static const uint8_t orders_verta_8x8[64] = { 40, 42, 44, 46, 56, 58, 60, 62, 41, 42, 45, 46, 57, 58, 61, 62, }; +#if CONFIG_EXT_PARTITION +static const uint8_t orders_verta_8x8[256] = { +0, 2, 4, 6, 16, 18, 20, 22, 64, 66, 68, 70, 80, 82, 84, 86, +1, 2, 5, 6, 17, 18, 21, 22, 65, 66, 69, 70, 81, 82, 85, 86, +8, 10, 12, 14, 24, 26, 28, 30, 72, 74, 76, 78, 88, 90, 92, 94, +9, 10, 13, 14, 25, 26, 29, 30, 73, 74, 77, 78, 89, 90, 93, 94, +32, 34, 36, 38, 48, 50, 52, 54, 96, 98, 100, 102, 112, 114, 116, 118, +33, 34, 37, 38, 49, 50, 53, 54, 97, 98, 101, 102, 113, 114, 117, 118, +40, 42, 44, 46, 56, 58, 60, 62, 104, 106, 108, 110, 120, 122, 124, 126, +41, 42, 45, 46, 57, 58, 61, 62, 105, 106, 109, 110, 121, 122, 125, 126, +128, 130, 132, 134, 144, 146, 148, 150, 192, 194, 196, 198, 208, 210, 212, 214, +129, 130, 133, 134, 145, 146, 149, 150, 193, 194, 197, 198, 209, 210, 213, 214, +136, 138, 140, 142, 152, 154, 156, 158, 200, 202, 204, 206, 216, 218, 220, 222, +137, 138, 141, 142, 153, 154, 157, 158, 201, 202, 205, 206, 217, 218, 221, 222, +160, 162, 164, 166, 176, 178, 180, 182, 224, 226, 228, 230, 240, 242, 244, 246, +161, 162, 165, 166, 177, 178, 181, 182, 225, 226, 229, 230, 241, 242, 245, 246, +168, 170, 172, 174, 184, 186, 188, 190, 232, 234, 236, 238, 248, 250, 252, 254, +169, 170, 173, 174, 185, 186, 189, 190, 233, 234, 237, 238, 249, 250, 253, 254, +}; +static const uint8_t *const orders_verta[BLOCK_SIZES] = { + // 4X4 + orders_verta_8x8, + // 4X8, 8X4, 8X8 + orders_verta_8x8, orders_verta_8x8, orders_verta_8x8, + // 8X16, 16X8, 16X16 + orders_8x16, orders_16x8, orders_verta_16x16, + // 16X32, 32X16, 32X32 + orders_16x32, orders_32x16, orders_verta_32x32, + // 32X64, 64X32, 64X64 + orders_32x64, orders_64x32, orders_verta_64x64, + // 64x128, 128x64, 128x128 + orders_64x128, orders_128x64, orders_128x128 +}; +#else static const uint8_t *const orders_verta[BLOCK_SIZES] = { - orders_verta_8x8, orders_verta_8x8, orders_verta_8x8, orders_verta_8x8, - orders_8x16, orders_16x8, orders_verta_16x16, - orders_16x32, orders_32x16, orders_verta_32x32, - orders_32x64, orders_64x32, orders_64x64, + // 4X4 + 
orders_verta_16x16, + // 4X8, 8X4, 8X8 + orders_verta_16x16, orders_verta_16x16, orders_verta_16x16, + // 8X16, 16X8, 16X16 + orders_16x32, orders_32x16, orders_verta_32x32, + // 16X32, 32X16, 32X32 + orders_32x64, orders_64x32, orders_verta_64x64, + // 32X64, 64X32, 64X64 + orders_64x128, orders_128x64, orders_128x128 }; +#endif // CONFIG_EXT_PARTITION #endif // CONFIG_EXT_PARTITION_TYPES static int vp10_has_right(BLOCK_SIZE bsize, int mi_row, int mi_col, @@ -154,24 +267,26 @@ static int vp10_has_right(BLOCK_SIZE bsize, int mi_row, int mi_col, order = orders_verta[bsize]; else #endif // CONFIG_EXT_PARTITION_TYPES - order = orders[bsize]; + order = orders[bsize]; if (x + step < w) return 1; - mi_row = (mi_row & 7) >> hl; - mi_col = (mi_col & 7) >> wl; + mi_row = (mi_row & MI_MASK) >> hl; + mi_col = (mi_col & MI_MASK) >> wl; // If top row of coding unit if (mi_row == 0) return 1; // If rightmost column of coding unit - if (((mi_col + 1) << wl) >= 8) + if (((mi_col + 1) << wl) >= MI_BLOCK_SIZE) return 0; - my_order = order[((mi_row + 0) << (3 - wl)) + mi_col + 0]; - tr_order = order[((mi_row - 1) << (3 - wl)) + mi_col + 1]; + my_order = + order[((mi_row + 0) << (MI_BLOCK_SIZE_LOG2 - wl)) + mi_col + 0]; + tr_order = + order[((mi_row - 1) << (MI_BLOCK_SIZE_LOG2 - wl)) + mi_col + 1]; return my_order > tr_order; } else { @@ -200,17 +315,17 @@ static int vp10_has_bottom(BLOCK_SIZE bsize, int mi_row, int mi_col, if (y + step < h) return 1; - mi_row = (mi_row & 7) >> hl; - mi_col = (mi_col & 7) >> wl; + mi_row = (mi_row & MI_MASK) >> hl; + mi_col = (mi_col & MI_MASK) >> wl; if (mi_col == 0) - return (mi_row << (hl + !ss_y)) + y + step < (8 << !ss_y); + return (mi_row << (hl + !ss_y)) + y + step < (MI_BLOCK_SIZE << !ss_y); - if (((mi_row + 1) << hl) >= 8) + if (((mi_row + 1) << hl) >= MI_BLOCK_SIZE) return 0; - my_order = order[((mi_row + 0) << (3 - wl)) + mi_col + 0]; - bl_order = order[((mi_row + 1) << (3 - wl)) + mi_col - 1]; + my_order = order[((mi_row + 0) << (MI_BLOCK_SIZE_LOG2 - wl)) + mi_col + 0]; + bl_order = order[((mi_row + 1) << (MI_BLOCK_SIZE_LOG2 - wl)) + mi_col - 1]; return bl_order < my_order; } @@ -336,8 +451,8 @@ static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bs, if (filter_type != INTRA_FILTER_LINEAR) { const int pad_size = SUBPEL_TAPS >> 1; int len; - DECLARE_ALIGNED(16, uint8_t, buf[SUBPEL_SHIFTS][64]); - DECLARE_ALIGNED(16, uint8_t, src[64 + SUBPEL_TAPS]); + DECLARE_ALIGNED(16, uint8_t, buf[SUBPEL_SHIFTS][MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, src[MAX_SB_SIZE + SUBPEL_TAPS]); uint8_t flags[SUBPEL_SHIFTS]; memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0])); @@ -467,8 +582,8 @@ static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bs, if (filter_type != INTRA_FILTER_LINEAR) { const int pad_size = SUBPEL_TAPS >> 1; int len, i; - DECLARE_ALIGNED(16, uint8_t, buf[64][4 * SUBPEL_SHIFTS]); - DECLARE_ALIGNED(16, uint8_t, src[(64 + SUBPEL_TAPS) * 4]); + DECLARE_ALIGNED(16, uint8_t, buf[MAX_SB_SIZE][4 * SUBPEL_SHIFTS]); + DECLARE_ALIGNED(16, uint8_t, src[(MAX_SB_SIZE + SUBPEL_TAPS) * 4]); uint8_t flags[SUBPEL_SHIFTS]; memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0])); @@ -1063,8 +1178,8 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd, int i; uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - DECLARE_ALIGNED(16, uint16_t, left_col[64]); - DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]); + DECLARE_ALIGNED(16, uint16_t, left_col[MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, above_data[MAX_SB_SIZE 
+ 16]); uint16_t *above_row = above_data + 16; const uint16_t *const_above_row = above_row; const int bs = 4 << tx_size; @@ -1220,9 +1335,9 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, int n_left_px, int n_bottomleft_px, int plane) { int i; - DECLARE_ALIGNED(16, uint8_t, left_col[64]); + DECLARE_ALIGNED(16, uint8_t, left_col[MAX_SB_SIZE]); const uint8_t *above_ref = ref - ref_stride; - DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]); + DECLARE_ALIGNED(16, uint8_t, above_data[MAX_SB_SIZE + 16]); uint8_t *above_row = above_data + 16; const uint8_t *const_above_row = above_row; const int bs = 4 << tx_size; diff --git a/vp10/common/thread_common.c b/vp10/common/thread_common.c index c48eb46b5..b2339c686 100644 --- a/vp10/common/thread_common.c +++ b/vp10/common/thread_common.c @@ -109,6 +109,12 @@ void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer, path = LF_PATH_SLOW; #endif // !CONFIG_EXT_PARTITION_TYPES +#if CONFIG_EXT_PARTITION + printf("STOPPING: This code has not been modified to work with the " + "extended coding unit size experiment"); + exit(EXIT_FAILURE); +#endif // CONFIG_EXT_PARTITION + for (mi_row = start; mi_row < stop; mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) { MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride; @@ -176,6 +182,12 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, const int num_workers = VPXMIN(nworkers, tile_cols); int i; +#if CONFIG_EXT_PARTITION + printf("STOPPING: This code has not been modified to work with the " + "extended coding unit size experiment"); + exit(EXIT_FAILURE); +#endif // CONFIG_EXT_PARTITION + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || num_workers > lf_sync->num_workers) { vp10_loop_filter_dealloc(lf_sync); diff --git a/vp10/common/vp10_convolve.c b/vp10/common/vp10_convolve.c index 8fdd8f16c..9e0dc29c3 100644 --- a/vp10/common/vp10_convolve.c +++ b/vp10/common/vp10_convolve.c @@ -5,8 +5,8 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_ports/mem.h" -#define MAX_BLOCK_WIDTH (64) -#define MAX_BLOCK_HEIGHT (64) +#define MAX_BLOCK_WIDTH (MAX_SB_SIZE) +#define MAX_BLOCK_HEIGHT (MAX_SB_SIZE) #define MAX_STEP (32) #define MAX_FILTER_TAP (12) diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c index 8aa002700..2e49b3685 100644 --- a/vp10/decoder/decodeframe.c +++ b/vp10/decoder/decodeframe.c @@ -489,7 +489,7 @@ static void extend_and_predict_highbd(const uint8_t *buf_ptr1, MACROBLOCKD *xd, int w, int h, int ref, int xs, int ys) { DECLARE_ALIGNED(16, uint16_t, - mc_buf_high[(CU_SIZE + 16) * 2 * (CU_SIZE + 16) * 2]); + mc_buf_high[(MAX_SB_SIZE + 16) * 2 * (MAX_SB_SIZE + 16) * 2]); const uint8_t *buf_ptr; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -535,7 +535,8 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, #endif // CONFIG_EXT_INTER && CONFIG_SUPERTX MACROBLOCKD *xd, int w, int h, int ref, int xs, int ys) { - DECLARE_ALIGNED(16, uint8_t, mc_buf[(CU_SIZE + 16) * 2 * (CU_SIZE + 16) * 2]); + DECLARE_ALIGNED(16, uint8_t, + mc_buf[(MAX_SB_SIZE + 16) * 2 * (MAX_SB_SIZE + 16) * 2]); const uint8_t *buf_ptr; build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w, @@ -1093,7 +1094,7 @@ static void set_param_topblock(VP10_COMMON *const cm, MACROBLOCKD *const xd, } #if CONFIG_VAR_TX xd->above_txfm_context = cm->above_txfm_context + mi_col; - xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07); + xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK); 
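These renamed masks only behave if the mode-info constants scale together with the superblock size. The real definitions live in vp10/common/enums.h (not shown in this patch excerpt); a sketch of the presumed relationships, which reproduces the old literal 7 and 15 masks when CONFIG_EXT_PARTITION is off:

/* Sketch of the assumed constant relationships (values inferred from the
   literal masks they replace; see enums.h for the real definitions). */
#define MI_SIZE_LOG2 3                    /* one mode-info unit is 8x8 pels */
#define MI_SIZE (1 << MI_SIZE_LOG2)
#if CONFIG_EXT_PARTITION
#define MAX_SB_SIZE_LOG2 7                /* 128x128 superblocks */
#else
#define MAX_SB_SIZE_LOG2 6                /* 64x64 superblocks */
#endif  /* CONFIG_EXT_PARTITION */
#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
#define MI_BLOCK_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2)  /* MIs per SB side: 8 or 16 */
#define MI_MASK (MI_BLOCK_SIZE - 1)              /* replaces the literal 7  */
#define MI_MASK_2 (2 * MI_BLOCK_SIZE - 1)        /* replaces the literal 15 */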
set_txfm_ctx(xd->left_txfm_context, xd->mi[0]->mbmi.tx_size, bh); set_txfm_ctx(xd->above_txfm_context, xd->mi[0]->mbmi.tx_size, bw); #endif @@ -1304,38 +1305,38 @@ static void dec_predict_sb_complex(VP10Decoder *const pbi, uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3]; DECLARE_ALIGNED(16, uint8_t, - tmp_buf1[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]); + tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); DECLARE_ALIGNED(16, uint8_t, - tmp_buf2[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]); + tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); DECLARE_ALIGNED(16, uint8_t, - tmp_buf3[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]); - int dst_stride1[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN}; - int dst_stride2[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN}; - int dst_stride3[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN}; + tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); + int dst_stride1[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE}; + int dst_stride2[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE}; + int dst_stride3[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE}; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAXTXLEN * MAXTXLEN * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN * len); + dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len); + dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len); dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAXTXLEN * MAXTXLEN * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN * len); + dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len); + dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len); dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3); - dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAXTXLEN * MAXTXLEN * len); - dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN * len); + dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len); + dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len); } else { #endif dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + MAXTXLEN * MAXTXLEN; - dst_buf1[2] = tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN; + dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE; + dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE; dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + MAXTXLEN * MAXTXLEN; - dst_buf2[2] = tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN; + dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE; + dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE; dst_buf3[0] = tmp_buf3; - dst_buf3[1] = tmp_buf3 + MAXTXLEN * MAXTXLEN; - dst_buf3[2] = tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN; + dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE; + dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE; #if CONFIG_VP9_HIGHBITDEPTH } #endif @@ -1900,39 +1901,37 @@ static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd, if (mbmi->obmc) { #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED(16, uint8_t, - tmp_buf1[2 * MAX_MB_PLANE * CU_SIZE * CU_SIZE]); + tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); DECLARE_ALIGNED(16, uint8_t, - tmp_buf2[2 * MAX_MB_PLANE * CU_SIZE * CU_SIZE]); + tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); #else DECLARE_ALIGNED(16, uint8_t, - tmp_buf1[MAX_MB_PLANE * CU_SIZE * CU_SIZE]); + tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]); DECLARE_ALIGNED(16, uint8_t, - tmp_buf2[MAX_MB_PLANE * CU_SIZE * CU_SIZE]); + tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]); #endif // CONFIG_VP9_HIGHBITDEPTH uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; - 
int dst_stride1[MAX_MB_PLANE] = {CU_SIZE, CU_SIZE, CU_SIZE}; - int dst_stride2[MAX_MB_PLANE] = {CU_SIZE, CU_SIZE, CU_SIZE}; + int dst_stride1[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE}; + int dst_stride2[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE}; assert(mbmi->sb_type >= BLOCK_8X8); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + CU_SIZE * CU_SIZE * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + - CU_SIZE * CU_SIZE * 2 * len); + dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len); + dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len); dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + CU_SIZE * CU_SIZE * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + - CU_SIZE * CU_SIZE * 2 * len); + dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); + dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len); } else { #endif // CONFIG_VP9_HIGHBITDEPTH dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + CU_SIZE * CU_SIZE; - dst_buf1[2] = tmp_buf1 + CU_SIZE * CU_SIZE * 2; + dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE; + dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2; dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + CU_SIZE * CU_SIZE; - dst_buf2[2] = tmp_buf2 + CU_SIZE * CU_SIZE * 2; + dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE; + dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2; #if CONFIG_VP9_HIGHBITDEPTH } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -3281,7 +3280,7 @@ static const uint8_t *decode_tiles(VP10Decoder *pbi, #if CONFIG_ANS &td->token_ans, #endif // CONFIG_ANS - BLOCK_64X64, 4); + BLOCK_LARGEST, MAX_SB_SIZE_LOG2 - 2); } pbi->mb.corrupted |= td->xd.corrupted; if (pbi->mb.corrupted) @@ -3396,7 +3395,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data, #if CONFIG_ANS &tile_data->token_ans, #endif // CONFIG_ANS - BLOCK_64X64, 4); + BLOCK_LARGEST, MAX_SB_SIZE_LOG2 - 2); } } return !tile_data->xd.corrupted; diff --git a/vp10/decoder/decoder.h b/vp10/decoder/decoder.h index 5337cbed4..23c742421 100644 --- a/vp10/decoder/decoder.h +++ b/vp10/decoder/decoder.h @@ -39,8 +39,8 @@ typedef struct TileData { #endif // CONFIG_ANS DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. So planes must be decoded serially */ - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); - DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); } TileData; typedef struct TileWorkerData { @@ -52,8 +52,8 @@ typedef struct TileWorkerData { FRAME_COUNTS counts; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. 
So planes must be decoded serially */ - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); - DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); struct vpx_internal_error_info error_info; } TileWorkerData; diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c index e0f59fb6e..bf4822197 100644 --- a/vp10/decoder/detokenize.c +++ b/vp10/decoder/detokenize.c @@ -62,7 +62,7 @@ static int decode_coefs(const MACROBLOCKD *xd, const vpx_prob *prob; unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1]; unsigned int (*eob_branch_count)[COEFF_CONTEXTS]; - uint8_t token_cache[32 * 32]; + uint8_t token_cache[MAX_TX_SQUARE]; const uint8_t *band_translate = get_band_translate(tx_size); int dq_shift; int v, token; @@ -245,7 +245,7 @@ static int decode_coefs_ans(const MACROBLOCKD *const xd, const vpx_prob *prob; unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1]; unsigned int (*eob_branch_count)[COEFF_CONTEXTS]; - uint8_t token_cache[32 * 32]; + uint8_t token_cache[MAX_TX_SQUARE]; const uint8_t *band_translate = get_band_translate(tx_size); int dq_shift; int v, token; diff --git a/vp10/encoder/aq_complexity.c b/vp10/encoder/aq_complexity.c index 2506a4e55..9f73eccf7 100644 --- a/vp10/encoder/aq_complexity.c +++ b/vp10/encoder/aq_complexity.c @@ -116,8 +116,8 @@ void vp10_caq_select_segment(VP10_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, VP10_COMMON *const cm = &cpi->common; const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; - const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int bw = num_8x8_blocks_wide_lookup[BLOCK_LARGEST]; + const int bh = num_8x8_blocks_high_lookup[BLOCK_LARGEST]; const int xmis = VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]); const int ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]); int x, y; diff --git a/vp10/encoder/aq_cyclicrefresh.c b/vp10/encoder/aq_cyclicrefresh.c index 4d7b7d950..defb97401 100644 --- a/vp10/encoder/aq_cyclicrefresh.c +++ b/vp10/encoder/aq_cyclicrefresh.c @@ -415,9 +415,9 @@ static void cyclic_refresh_update_map(VP10_COMP *const cpi) { bl_index = mi_row * cm->mi_cols + mi_col; // Loop through all 8x8 blocks in superblock and update map. 
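The xmis/ymis clamping below matters more with 128-pixel superblocks, since the last superblock in a row or column hangs over the frame edge more often. A self-contained toy of the same clamping:

#include <stdio.h>

#define VPXMIN(a, b) ((a) < (b) ? (a) : (b)) /* as in vpx_dsp_common.h */

int main(void) {
  const int sb_mis = 16;  /* 8x8 MIs across a 128x128 superblock */
  const int mi_cols = 30; /* e.g. a 240-pixel-wide frame */
  int mi_col;
  for (mi_col = 0; mi_col < mi_cols; mi_col += sb_mis) {
    const int xmis = VPXMIN(mi_cols - mi_col, sb_mis);
    printf("SB at mi_col %2d covers %2d MI columns\n", mi_col, xmis);
  }
  return 0;
}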
xmis = - VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]); + VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_LARGEST]); ymis = - VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]); + VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_LARGEST]); for (y = 0; y < ymis; y++) { for (x = 0; x < xmis; x++) { const int bl_index2 = bl_index + y * cm->mi_cols + x; diff --git a/vp10/encoder/aq_variance.c b/vp10/encoder/aq_variance.c index bed5162fb..45dc8b8f1 100644 --- a/vp10/encoder/aq_variance.c +++ b/vp10/encoder/aq_variance.c @@ -32,9 +32,11 @@ static const int segment_id[ENERGY_SPAN] = {0, 1, 1, 2, 3, 4}; #define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN] -DECLARE_ALIGNED(16, static const uint8_t, vp10_64_zeros[64]) = {0}; +DECLARE_ALIGNED(16, static const uint8_t, + vp10_all_zeros[MAX_SB_SIZE]) = {0}; #if CONFIG_VP9_HIGHBITDEPTH -DECLARE_ALIGNED(16, static const uint16_t, vp10_highbd_64_zeros[64]) = {0}; +DECLARE_ALIGNED(16, static const uint16_t, + vp10_highbd_all_zeros[MAX_SB_SIZE]) = {0}; #endif unsigned int vp10_vaq_segment_id(int energy) { @@ -153,17 +155,17 @@ static unsigned int block_variance(VP10_COMP *cpi, MACROBLOCK *x, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, - CONVERT_TO_BYTEPTR(vp10_highbd_64_zeros), 0, bw, bh, + CONVERT_TO_BYTEPTR(vp10_highbd_all_zeros), 0, bw, bh, &sse, &avg); sse >>= 2 * (xd->bd - 8); avg >>= (xd->bd - 8); } else { aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, - vp10_64_zeros, 0, bw, bh, &sse, &avg); + vp10_all_zeros, 0, bw, bh, &sse, &avg); } #else aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, - vp10_64_zeros, 0, bw, bh, &sse, &avg); + vp10_all_zeros, 0, bw, bh, &sse, &avg); #endif // CONFIG_VP9_HIGHBITDEPTH var = sse - (((int64_t)avg * avg) / (bw * bh)); return (256 * var) / (bw * bh); @@ -172,17 +174,17 @@ static unsigned int block_variance(VP10_COMP *cpi, MACROBLOCK *x, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, - CONVERT_TO_BYTEPTR(vp10_highbd_64_zeros), + CONVERT_TO_BYTEPTR(vp10_highbd_all_zeros), 0, &sse); } else { var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, - vp10_64_zeros, 0, &sse); + vp10_all_zeros, 0, &sse); } #else var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, - vp10_64_zeros, 0, &sse); + vp10_all_zeros, 0, &sse); #endif // CONFIG_VP9_HIGHBITDEPTH return (256 * var) >> num_pels_log2_lookup[bs]; } diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c index ac731352e..721a7a6da 100644 --- a/vp10/encoder/bitstream.c +++ b/vp10/encoder/bitstream.c @@ -1893,7 +1893,7 @@ static void write_modes(VP10_COMP *const cpi, for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, 0, - mi_row, mi_col, BLOCK_64X64); + mi_row, mi_col, BLOCK_LARGEST); } } } diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h index 582f3bc23..b5e61d99e 100644 --- a/vp10/encoder/block.h +++ b/vp10/encoder/block.h @@ -28,7 +28,7 @@ typedef struct { } diff; typedef struct macroblock_plane { - DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); + DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]); tran_low_t *qcoeff; tran_low_t *coeff; uint16_t *eobs; @@ -63,10 +63,10 @@ typedef struct { } MB_MODE_INFO_EXT; typedef struct { - uint8_t best_palette_color_map[4096]; - double 
kmeans_data_buf[2 * 4096]; - uint8_t kmeans_indices_buf[4096]; - uint8_t kmeans_pre_indices_buf[4096]; + uint8_t best_palette_color_map[MAX_SB_SQUARE]; + double kmeans_data_buf[2 * MAX_SB_SQUARE]; + uint8_t kmeans_indices_buf[MAX_SB_SQUARE]; + uint8_t kmeans_pre_indices_buf[MAX_SB_SQUARE]; } PALETTE_BUFFER; typedef struct macroblock MACROBLOCK; @@ -140,11 +140,11 @@ struct macroblock { // Notes transform blocks where no coefficents are coded. // Set during mode selection. Read during block encoding. - uint8_t zcoeff_blk[TX_SIZES][256]; + uint8_t zcoeff_blk[TX_SIZES][MI_BLOCK_SIZE * MI_BLOCK_SIZE * 4]; #if CONFIG_VAR_TX - uint8_t blk_skip[MAX_MB_PLANE][256]; + uint8_t blk_skip[MAX_MB_PLANE][MI_BLOCK_SIZE * MI_BLOCK_SIZE * 4]; #if CONFIG_REF_MV - uint8_t blk_skip_drl[MAX_MB_PLANE][256]; + uint8_t blk_skip_drl[MAX_MB_PLANE][MI_BLOCK_SIZE * MI_BLOCK_SIZE * 4]; #endif #endif @@ -164,12 +164,12 @@ struct macroblock { int quant_fp; // skip forward transform and quantization - uint8_t skip_txfm[MAX_MB_PLANE][4]; + uint8_t skip_txfm[MAX_MB_PLANE][MAX_TX_BLOCKS_IN_MAX_SB]; #define SKIP_TXFM_NONE 0 #define SKIP_TXFM_AC_DC 1 #define SKIP_TXFM_AC_ONLY 2 - int64_t bsse[MAX_MB_PLANE][4]; + int64_t bsse[MAX_MB_PLANE][MAX_TX_BLOCKS_IN_MAX_SB]; // Used to store sub partition's choices. MV pred_mv[MAX_REF_FRAMES]; diff --git a/vp10/encoder/context_tree.c b/vp10/encoder/context_tree.c index 0a7619530..b7c826045 100644 --- a/vp10/encoder/context_tree.c +++ b/vp10/encoder/context_tree.c @@ -11,11 +11,14 @@ #include "vp10/encoder/context_tree.h" #include "vp10/encoder/encoder.h" -static const BLOCK_SIZE square[] = { +static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 2] = { BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, +#if CONFIG_EXT_PARTITION + BLOCK_128X128, +#endif // CONFIG_EXT_PARTITION }; static void alloc_mode_context(VP10_COMMON *cm, int num_4x4_blk, @@ -53,6 +56,14 @@ static void alloc_mode_context(VP10_COMMON *cm, int num_4x4_blk, ctx->eobs_pbuf[i][k] = ctx->eobs[i][k]; } } + + if (cm->allow_screen_content_tools) { + for (i = 0; i < 2; ++i) { + CHECK_MEM_ERROR(cm, ctx->color_index_map[i], + vpx_memalign(32, + num_pix * sizeof(*ctx->color_index_map[i]))); + } + } } static void free_mode_context(PICK_MODE_CONTEXT *ctx) { @@ -177,8 +188,13 @@ static void free_tree_contexts(PC_TREE *tree) { // represents the state of our search. void vp10_setup_pc_tree(VP10_COMMON *cm, ThreadData *td) { int i, j; +#if CONFIG_EXT_PARTITION + const int leaf_nodes = 256; + const int tree_nodes = 256 + 64 + 16 + 4 + 1; +#else const int leaf_nodes = 64; const int tree_nodes = 64 + 16 + 4 + 1; +#endif // CONFIG_EXT_PARTITION int pc_tree_index = 0; PC_TREE *this_pc; PICK_MODE_CONTEXT *this_leaf; @@ -217,7 +233,7 @@ void vp10_setup_pc_tree(VP10_COMMON *cm, ThreadData *td) { // Each node has 4 leaf nodes, fill each block_size level of the tree // from leafs to the root. 
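The leaf_nodes/tree_nodes constants above follow from the quadtree shape: each level holds a quarter of the nodes of the level below, down to a single root (256 + 64 + 16 + 4 + 1 = 341 with 128x128 superblocks, 85 without). A quick standalone check:

#include <assert.h>

static int quadtree_nodes(int leaf_nodes) {
  int total = 0, n;
  for (n = leaf_nodes; n > 0; n >>= 2) /* e.g. 256, 64, 16, 4, 1 */
    total += n;
  return total;
}

int main(void) {
  assert(quadtree_nodes(64) == 64 + 16 + 4 + 1);        /* 85,  64x64 SBs   */
  assert(quadtree_nodes(256) == 256 + 64 + 16 + 4 + 1); /* 341, 128x128 SBs */
  return 0;
}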
- for (nodes = 16; nodes > 0; nodes >>= 2) { + for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { for (i = 0; i < nodes; ++i) { PC_TREE *const tree = &td->pc_tree[pc_tree_index]; alloc_tree_contexts(cm, tree, 4 << (2 * square_index)); @@ -233,11 +249,17 @@ void vp10_setup_pc_tree(VP10_COMMON *cm, ThreadData *td) { } void vp10_free_pc_tree(ThreadData *td) { +#if CONFIG_EXT_PARTITION + const int leaf_nodes = 256; + const int tree_nodes = 256 + 64 + 16 + 4 + 1; +#else + const int leaf_nodes = 64; const int tree_nodes = 64 + 16 + 4 + 1; +#endif // CONFIG_EXT_PARTITION int i; // Set up all 4x4 mode contexts - for (i = 0; i < 64; ++i) + for (i = 0; i < leaf_nodes; ++i) free_mode_context(&td->leaf_tree[i]); // Sets up all the leaf nodes in the tree. diff --git a/vp10/encoder/context_tree.h b/vp10/encoder/context_tree.h index de17e3ea2..7b49354d6 100644 --- a/vp10/encoder/context_tree.h +++ b/vp10/encoder/context_tree.h @@ -49,7 +49,6 @@ typedef struct { // For current partition, only if all Y, U, and V transform blocks' // coefficients are quantized to 0, skippable is set to 0. int skippable; - uint8_t skip_txfm[MAX_MB_PLANE << 2]; int best_mode_index; int hybrid_pred_diff; int comp_pred_diff; diff --git a/vp10/encoder/denoiser.c b/vp10/encoder/denoiser.c index e87667653..fb0280a58 100644 --- a/vp10/encoder/denoiser.c +++ b/vp10/encoder/denoiser.c @@ -189,7 +189,7 @@ int vp10_denoiser_filter_c(const uint8_t *sig, int sig_stride, static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row, int mi_col) { - return framebuf + (stride * mi_row * 8) + (mi_col * 8); + return framebuf + (stride * mi_row * MI_SIZE) + (mi_col * MI_SIZE); } static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c index bcedc0ce1..b73f66cce 100644 --- a/vp10/encoder/encodeframe.c +++ b/vp10/encoder/encodeframe.c @@ -93,7 +93,16 @@ static void rd_supertx_sb(VP10_COMP *cpi, ThreadData *td, // purposes of activity masking. // Eventually this should be replaced by custom no-reference routines, // which will be faster. 
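For context on the VP10_VAR_OFFS tables below: vp10_get_sby_perpixel_variance() points the variance kernel at this flat all-128 row with a reference stride of 0, so the constant cancels and the result reduces to the source block's own variance. A rough scalar equivalent (hypothetical helper, not the SIMD kernel itself; the encoder keeps extra fractional precision):

#include <stdint.h>

/* Variance of a bw x bh block about its own mean, normalized per pixel:
   sse and sum of (pixel - 128), then sse - sum^2 / N, as in the vf calls. */
static unsigned int perpixel_variance_vs_flat128(const uint8_t *src,
                                                 int stride, int bw, int bh) {
  int64_t sse = 0, sum = 0;
  int r, c;
  for (r = 0; r < bh; ++r)
    for (c = 0; c < bw; ++c) {
      const int d = src[r * stride + c] - 128;
      sse += (int64_t)d * d;
      sum += d;
    }
  return (unsigned int)((sse - (sum * sum) / (bw * bh)) / (bw * bh));
}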
-static const uint8_t VP9_VAR_OFFS[64] = { +static const uint8_t VP10_VAR_OFFS[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, +#if CONFIG_EXT_PARTITION 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, @@ -102,10 +111,20 @@ static const uint8_t VP9_VAR_OFFS[64] = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +#endif // CONFIG_EXT_PARTITION }; #if CONFIG_VP9_HIGHBITDEPTH -static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = { +static const uint16_t VP10_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, +#if CONFIG_EXT_PARTITION 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, @@ -114,9 +133,19 @@ static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +#endif // CONFIG_EXT_PARTITION }; -static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = { +static const uint16_t VP10_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, +#if CONFIG_EXT_PARTITION 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, @@ -125,9 +154,19 @@ static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = { 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4 +#endif // CONFIG_EXT_PARTITION }; -static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = { +static const uint16_t VP10_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, +#if CONFIG_EXT_PARTITION 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, @@ 
-136,6 +175,7 @@ static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = { 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16 +#endif // CONFIG_EXT_PARTITION }; #endif // CONFIG_VP9_HIGHBITDEPTH @@ -144,7 +184,7 @@ unsigned int vp10_get_sby_perpixel_variance(VP10_COMP *cpi, BLOCK_SIZE bs) { unsigned int sse; const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, - VP9_VAR_OFFS, 0, &sse); + VP10_VAR_OFFS, 0, &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } @@ -155,18 +195,18 @@ unsigned int vp10_high_get_sby_perpixel_variance( switch (bd) { case 10: var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, - CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10), + CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_10), 0, &sse); break; case 12: var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, - CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12), + CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_12), 0, &sse); break; case 8: default: var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, - CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), + CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_8), 0, &sse); break; } @@ -406,6 +446,13 @@ typedef struct { v32x32 split[4]; } v64x64; +#if CONFIG_EXT_PARTITION +typedef struct { + partition_variance part_variances; + v64x64 split[4]; +} v128x128; +#endif // CONFIG_EXT_PARTITION + typedef struct { partition_variance *part_variances; var *split[4]; @@ -415,12 +462,24 @@ typedef enum { V16X16, V32X32, V64X64, +#if CONFIG_EXT_PARTITION + V128X128, +#endif // CONFIG_EXT_PARTITION } TREE_LEVEL; static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { int i; node->part_variances = NULL; switch (bsize) { +#if CONFIG_EXT_PARTITION + case BLOCK_128X128: { + v128x128 *vt = (v128x128 *) data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } +#endif // CONFIG_EXT_PARTITION case BLOCK_64X64: { v64x64 *vt = (v64x64 *) data; node->part_variances = &vt->part_variances; @@ -770,7 +829,8 @@ static int choose_partitioning(VP10_COMP *cpi, const uint8_t *d; int sp; int dp; - int pixels_wide = 64, pixels_high = 64; + int pixels_wide = 8 * num_8x8_blocks_wide_lookup[BLOCK_LARGEST]; + int pixels_high = 8 * num_8x8_blocks_high_lookup[BLOCK_LARGEST]; int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]}; @@ -781,10 +841,11 @@ static int choose_partitioning(VP10_COMP *cpi, int variance4x4downsample[16]; int segment_id = CR_SEGMENT_ID_BASE; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { const uint8_t *const map = cm->seg.update_map ? 
cpi->segmentation_map : cm->last_frame_seg_map; - segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); + segment_id = get_segment_id(cm, map, BLOCK_LARGEST, mi_row, mi_col); if (cyclic_refresh_segment_id_boosted(segment_id)) { int q = vp10_get_qindex(&cm->seg, segment_id, cm->base_qindex); @@ -792,11 +853,12 @@ } } -#if CONFIG_EXT_PARTITION_TYPES - assert(0); -#endif +#if CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES + printf("Not yet implemented: choose_partitioning\n"); + exit(-1); +#endif // CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES - set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_LARGEST); if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); @@ -813,8 +875,20 @@ const YV12_BUFFER_CONFIG *yv12_g = NULL; unsigned int y_sad, y_sad_g; - const BLOCK_SIZE bsize = BLOCK_32X32 - + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows); + + const int max_mi_block_size = num_8x8_blocks_wide_lookup[BLOCK_LARGEST]; + const int is_right_edge = mi_col + max_mi_block_size / 2 > cm->mi_cols; + const int is_left_edge = mi_row + max_mi_block_size / 2 > cm->mi_rows; + BLOCK_SIZE bsize; + + if (is_right_edge && is_left_edge) + bsize = get_subsize(BLOCK_LARGEST, PARTITION_SPLIT); + else if (is_right_edge) + bsize = get_subsize(BLOCK_LARGEST, PARTITION_VERT); + else if (is_left_edge) + bsize = get_subsize(BLOCK_LARGEST, PARTITION_HORZ); + else + bsize = BLOCK_LARGEST; assert(yv12 != NULL); yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); @@ -834,7 +908,7 @@ &cm->frame_refs[LAST_FRAME - 1].sf); mbmi->ref_frame[0] = LAST_FRAME; mbmi->ref_frame[1] = NONE; - mbmi->sb_type = BLOCK_64X64; + mbmi->sb_type = BLOCK_LARGEST; mbmi->mv[0].as_int = 0; mbmi->interp_filter = BILINEAR; @@ -849,7 +923,7 @@ x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv; } - vp10_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); + vp10_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_LARGEST); for (i = 1; i <= 2; ++i) { struct macroblock_plane *p = &x->plane[i]; @@ -868,33 +942,29 @@ d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; - // If the y_sad is very small, take 64x64 as partition and exit. - // Don't check on boosted segment for now, as 64x64 is suppressed there. - if (segment_id == CR_SEGMENT_ID_BASE && - y_sad < cpi->vbp_threshold_sad) { - const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64]; - const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64]; - if (mi_col + block_width / 2 < cm->mi_cols && - mi_row + block_height / 2 < cm->mi_rows) { - set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64); + // If the y_sad is very small, take the largest partition and exit. + // Don't check on boosted segment for now, as largest is suppressed there. 
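One nit on the new edge handling above: is_left_edge is computed from mi_row against cm->mi_rows, so it actually detects the bottom edge (the PARTITION_HORZ fallback, a short wide block, confirms that reading); is_bottom_edge would be the clearer name. A standalone model of the selection:

typedef enum { SB_FULL, SB_HALF_VERT, SB_HALF_HORZ, SB_QUARTER } SAD_BSIZE;

/* Mirrors the bsize choice above; the real code maps these cases through
   get_subsize(BLOCK_LARGEST, PARTITION_*). */
static SAD_BSIZE pick_sad_bsize(int mi_row, int mi_col, int mi_rows,
                                int mi_cols, int sb_mis) {
  const int over_right = mi_col + sb_mis / 2 > mi_cols;
  const int over_bottom = mi_row + sb_mis / 2 > mi_rows; /* "is_left_edge" */
  if (over_right && over_bottom) return SB_QUARTER; /* PARTITION_SPLIT */
  if (over_right) return SB_HALF_VERT;              /* PARTITION_VERT  */
  if (over_bottom) return SB_HALF_HORZ;             /* PARTITION_HORZ  */
  return SB_FULL;
}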
+ if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) { + if (!is_right_edge && !is_left_edge) { + set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_LARGEST); return 0; } } } else { - d = VP9_VAR_OFFS; + d = VP10_VAR_OFFS; dp = 0; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { switch (xd->bd) { case 10: - d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10); + d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_10); break; case 12: - d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12); + d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_12); break; case 8: default: - d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8); + d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_8); break; } } @@ -1699,15 +1769,6 @@ static void rd_pick_sb_modes(VP10_COMP *cpi, p[i].eobs = ctx->eobs_pbuf[i][0]; } - if (cm->current_video_frame == 0 && cm->allow_screen_content_tools) { - for (i = 0; i < 2; ++i) { - if (ctx->color_index_map[i] == 0) { - CHECK_MEM_ERROR(cm, ctx->color_index_map[i], - vpx_memalign(16, (ctx->num_4x4_blk << 4) * - sizeof(*ctx->color_index_map[i]))); - } - } - } for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; @@ -2084,17 +2145,16 @@ static void update_stats(VP10_COMMON *cm, ThreadData *td } } - typedef struct { - ENTROPY_CONTEXT a[16 * MAX_MB_PLANE]; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sa[8]; - PARTITION_CONTEXT sl[8]; + ENTROPY_CONTEXT a[2 * MI_BLOCK_SIZE * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[2 * MI_BLOCK_SIZE * MAX_MB_PLANE]; + PARTITION_CONTEXT sa[MI_BLOCK_SIZE]; + PARTITION_CONTEXT sl[MI_BLOCK_SIZE]; #if CONFIG_VAR_TX TXFM_CONTEXT *p_ta; TXFM_CONTEXT *p_tl; - TXFM_CONTEXT ta[8]; - TXFM_CONTEXT tl[8]; + TXFM_CONTEXT ta[MI_BLOCK_SIZE]; + TXFM_CONTEXT tl[MI_BLOCK_SIZE]; #endif } RD_SEARCH_MACROBLOCK_CONTEXT; @@ -2892,11 +2952,11 @@ static void rd_use_partition(VP10_COMP *cpi, // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success. 
- if (bsize == BLOCK_64X64) + if (bsize == BLOCK_LARGEST) assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX); if (do_recon) { - int output_enabled = (bsize == BLOCK_64X64); + int output_enabled = (bsize == BLOCK_LARGEST); encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); } @@ -2909,21 +2969,38 @@ static void rd_use_partition(VP10_COMP *cpi, } static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = { - BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, - BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, - BLOCK_8X8, BLOCK_8X8, BLOCK_8X8, - BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, - BLOCK_16X16 + BLOCK_4X4, // 4x4 + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8 + BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16 + BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32 + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64 +#if CONFIG_EXT_PARTITION + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 // 64x128, 128x64, 128x128 +#endif // CONFIG_EXT_PARTITION }; static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { - BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, - BLOCK_16X16, BLOCK_32X32, BLOCK_32X32, - BLOCK_32X32, BLOCK_64X64, BLOCK_64X64, - BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, - BLOCK_64X64 + BLOCK_8X8, // 4x4 + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8 + BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16 + BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32 + BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 32x64, 64x32, 64x64 +#if CONFIG_EXT_PARTITION + BLOCK_64X64, BLOCK_64X64, BLOCK_128X128 // 64x128, 128x64, 128x128 +#endif // CONFIG_EXT_PARTITION }; +// Next square block size less than or equal to the current block size. +static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = { + BLOCK_4X4, // 4x4 + BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8 + BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16 + BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32 + BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64 +#if CONFIG_EXT_PARTITION + BLOCK_64X64, BLOCK_64X64, BLOCK_128X128 // 64x128, 128x64, 128x128 +#endif // CONFIG_EXT_PARTITION +}; // Look at all the mode_info entries for blocks that are part of this // partition and find the min and max values for sb_type. @@ -2954,15 +3031,6 @@ static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8, } } -// Next square block size less or equal than current block size. -static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = { - BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, - BLOCK_8X8, BLOCK_8X8, BLOCK_8X8, - BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, - BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, - BLOCK_64X64 -}; - // Look at neighboring blocks and set a min and max partition size based on // what they chose. static void rd_auto_partition_range(VP10_COMP *cpi, const TileInfo *const tile, @@ -2978,13 +3046,13 @@ static void rd_auto_partition_range(VP10_COMP *cpi, const TileInfo *const tile, const int col8x8_remaining = tile->mi_col_end - mi_col; int bh, bw; BLOCK_SIZE min_size = BLOCK_4X4; - BLOCK_SIZE max_size = BLOCK_64X64; + BLOCK_SIZE max_size = BLOCK_LARGEST; int bs_hist[BLOCK_SIZES] = {0}; // Trap case where we do not have a prediction.
if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) { // Default "min to max" and "max to min" - min_size = BLOCK_64X64; + min_size = BLOCK_LARGEST; max_size = BLOCK_4X4; // NOTE: each call to get_sb_partition_size_range() uses the previous @@ -3054,7 +3122,7 @@ static void set_partition_range(VP10_COMMON *cm, MACROBLOCKD *xd, MODE_INFO **prev_mi = &cm->prev_mi_grid_visible[idx_str]; BLOCK_SIZE bs, min_size, max_size; - min_size = BLOCK_64X64; + min_size = BLOCK_LARGEST; max_size = BLOCK_4X4; if (prev_mi) { @@ -3104,16 +3172,27 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { } #if CONFIG_FP_MB_STATS -const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = - {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4}; -const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = - {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4}; const int qindex_skip_threshold_lookup[BLOCK_SIZES] = - {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120}; + {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120, +#if CONFIG_EXT_PARTITION + // TODO(debargha): What are the correct numbers here? + 130, 130, 150 +#endif // CONFIG_EXT_PARTITION + }; const int qindex_split_threshold_lookup[BLOCK_SIZES] = - {0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120}; + {0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120, +#if CONFIG_EXT_PARTITION + // TODO(debargha): What are the correct numbers here? + 160, 160, 240 +#endif // CONFIG_EXT_PARTITION + }; const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6}; + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6, +#if CONFIG_EXT_PARTITION + // TODO(debargha): What are the correct numbers here? + 8, 8, 10 +#endif // CONFIG_EXT_PARTITION + }; typedef enum { MV_ZERO = 0, @@ -3526,8 +3605,8 @@ static void rd_pick_partition(VP10_COMP *cpi, ThreadData *td, pc_tree->partitioning = PARTITION_NONE; // Adjust dist breakout threshold according to the partition size.
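// (A worked example of the generalised shift below, assuming
// MAX_SB_SIZE_LOG2 == 7 for 128x128 superblocks: b_width_log2_lookup /
// b_height_log2_lookup give block dimensions in 4-pel units, so
// BLOCK_128X128 yields a shift of 2 * (7 - 2) - (5 + 5) = 0 and
// BLOCK_64X64 a shift of 2 * (7 - 2) - (4 + 4) = 2; with
// MAX_SB_SIZE_LOG2 == 6 the expression reduces to the old constant 8.)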
- dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] + - b_height_log2_lookup[bsize]); + dist_breakout_thr >>= (2 * (MAX_SB_SIZE_LOG2 - 2)) + - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); rate_breakout_thr *= num_pels_log2_lookup[bsize]; @@ -4124,12 +4203,12 @@ static void rd_pick_partition(VP10_COMP *cpi, ThreadData *td, if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && pc_tree->index != 3) { - int output_enabled = (bsize == BLOCK_64X64); + int output_enabled = (bsize == BLOCK_LARGEST); encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); } - if (bsize == BLOCK_64X64) { + if (bsize == BLOCK_LARGEST) { assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip)); assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); @@ -4149,6 +4228,11 @@ static void encode_rd_sb_row(VP10_COMP *cpi, MACROBLOCKD *const xd = &x->e_mbd; SPEED_FEATURES *const sf = &cpi->sf; int mi_col; +#if CONFIG_EXT_PARTITION + const int leaf_nodes = 256; +#else + const int leaf_nodes = 64; +#endif // CONFIG_EXT_PARTITION // Initialize the left context for the new SB row vp10_zero_left_context(xd); @@ -4170,10 +4254,10 @@ static void encode_rd_sb_row(VP10_COMP *cpi, MODE_INFO **mi = cm->mi_grid_visible + idx_str; if (sf->adaptive_pred_interp_filter) { - for (i = 0; i < 64; ++i) + for (i = 0; i < leaf_nodes; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE; - for (i = 0; i < 64; ++i) { + for (i = 0; i < leaf_nodes; ++i) { td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; @@ -4187,29 +4271,29 @@ static void encode_rd_sb_row(VP10_COMP *cpi, if (seg->enabled) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); + int segment_id = get_segment_id(cm, map, BLOCK_LARGEST, mi_row, mi_col); seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); } x->source_variance = UINT_MAX; if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { const BLOCK_SIZE bsize = - seg_skip ? BLOCK_64X64 : sf->always_this_block_size; - set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); + seg_skip ? 
BLOCK_LARGEST : sf->always_this_block_size; + set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_LARGEST); set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, - BLOCK_64X64, &dummy_rate, &dummy_dist, + BLOCK_LARGEST, &dummy_rate, &dummy_dist, #if CONFIG_SUPERTX &dummy_rate_nocoef, #endif // CONFIG_SUPERTX 1, td->pc_root); } else if (cpi->partition_search_skippable_frame) { BLOCK_SIZE bsize; - set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); + set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_LARGEST); bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, - BLOCK_64X64, &dummy_rate, &dummy_dist, + BLOCK_LARGEST, &dummy_rate, &dummy_dist, #if CONFIG_SUPERTX &dummy_rate_nocoef, #endif // CONFIG_SUPERTX @@ -4218,7 +4302,7 @@ static void encode_rd_sb_row(VP10_COMP *cpi, cm->frame_type != KEY_FRAME) { choose_partitioning(cpi, tile_info, x, mi_row, mi_col); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, - BLOCK_64X64, &dummy_rate, &dummy_dist, + BLOCK_LARGEST, &dummy_rate, &dummy_dist, #if CONFIG_SUPERTX &dummy_rate_nocoef, #endif // CONFIG_SUPERTX @@ -4226,12 +4310,12 @@ static void encode_rd_sb_row(VP10_COMP *cpi, } else { // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); + set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_LARGEST); rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, &x->min_partition_size, &x->max_partition_size); } - rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_LARGEST, &dummy_rdc, #if CONFIG_SUPERTX &dummy_rate_nocoef, @@ -4930,19 +5014,15 @@ static void encode_superblock(VP10_COMP *cpi, ThreadData *td, #if CONFIG_OBMC if (mbmi->obmc) { #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint8_t, - tmp_buf1[2 * MAX_MB_PLANE * CU_SIZE * CU_SIZE]); - DECLARE_ALIGNED(16, uint8_t, - tmp_buf2[2 * MAX_MB_PLANE * CU_SIZE * CU_SIZE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); #else - DECLARE_ALIGNED(16, uint8_t, - tmp_buf1[MAX_MB_PLANE * CU_SIZE * CU_SIZE]); - DECLARE_ALIGNED(16, uint8_t, - tmp_buf2[MAX_MB_PLANE * CU_SIZE * CU_SIZE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]); #endif // CONFIG_VP9_HIGHBITDEPTH uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; - int dst_stride1[MAX_MB_PLANE] = {CU_SIZE, CU_SIZE, CU_SIZE}; - int dst_stride2[MAX_MB_PLANE] = {CU_SIZE, CU_SIZE, CU_SIZE}; + int dst_stride1[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE}; + int dst_stride2[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE}; assert(mbmi->sb_type >= BLOCK_8X8); @@ -4950,21 +5030,19 @@ static void encode_superblock(VP10_COMP *cpi, ThreadData *td, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + CU_SIZE * CU_SIZE * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR( - tmp_buf1 + CU_SIZE * CU_SIZE * 2 * len); + dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len); + dst_buf1[2] = 
CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len); dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + CU_SIZE * CU_SIZE * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR( - tmp_buf2 + CU_SIZE * CU_SIZE * 2 * len); + dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); + dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len); } else { #endif // CONFIG_VP9_HIGHBITDEPTH dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + CU_SIZE * CU_SIZE; - dst_buf1[2] = tmp_buf1 + CU_SIZE * CU_SIZE * 2; + dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE; + dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2; dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + CU_SIZE * CU_SIZE; - dst_buf2[2] = tmp_buf2 + CU_SIZE * CU_SIZE * 2; + dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE; + dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2; #if CONFIG_VP9_HIGHBITDEPTH } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -5447,38 +5525,35 @@ static void predict_sb_complex(VP10_COMP *cpi, ThreadData *td, int i, ctx; uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3]; - DECLARE_ALIGNED(16, uint8_t, - tmp_buf1[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]); - DECLARE_ALIGNED(16, uint8_t, - tmp_buf2[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]); - DECLARE_ALIGNED(16, uint8_t, - tmp_buf3[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]); - int dst_stride1[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN}; - int dst_stride2[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN}; - int dst_stride3[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN}; + DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); + int dst_stride1[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE}; + int dst_stride2[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE}; + int dst_stride3[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE}; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAXTXLEN * MAXTXLEN * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN * len); + dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len); + dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len); dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAXTXLEN * MAXTXLEN * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN * len); + dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len); + dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len); dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3); - dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAXTXLEN * MAXTXLEN * len); - dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN * len); + dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len); + dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len); } else { #endif // CONFIG_VP9_HIGHBITDEPTH dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + MAXTXLEN * MAXTXLEN; - dst_buf1[2] = tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN; + dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE; + dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE; dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + MAXTXLEN * MAXTXLEN; - dst_buf2[2] = tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN; + dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE; + dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE; dst_buf3[0] = tmp_buf3; - dst_buf3[1] = tmp_buf3 + MAXTXLEN * MAXTXLEN; 
- dst_buf3[2] = tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN; + dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE; + dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE; #if CONFIG_VP9_HIGHBITDEPTH } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -6037,7 +6112,8 @@ static void rd_supertx_sb(VP10_COMP *cpi, ThreadData *td, sse_uv = 0; for (plane = 1; plane < MAX_MB_PLANE; ++plane) { #if CONFIG_VAR_TX - ENTROPY_CONTEXT ctxa[16], ctxl[16]; + ENTROPY_CONTEXT ctxa[2 * MI_BLOCK_SIZE]; + ENTROPY_CONTEXT ctxl[2 * MI_BLOCK_SIZE]; const struct macroblockd_plane *const pd = &xd->plane[plane]; int coeff_ctx = 1; @@ -6081,7 +6157,8 @@ static void rd_supertx_sb(VP10_COMP *cpi, ThreadData *td, #endif // CONFIG_EXT_TX for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { #if CONFIG_VAR_TX - ENTROPY_CONTEXT ctxa[16], ctxl[16]; + ENTROPY_CONTEXT ctxa[2 * MI_BLOCK_SIZE]; + ENTROPY_CONTEXT ctxl[2 * MI_BLOCK_SIZE]; const struct macroblockd_plane *const pd = &xd->plane[0]; int coeff_ctx = 1; #endif // CONFIG_VAR_TX diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c index 0b7a04abe..429ac4f5b 100644 --- a/vp10/encoder/encodemb.c +++ b/vp10/encoder/encodemb.c @@ -29,8 +29,8 @@ #include "vp10/encoder/tokenize.h" struct optimize_ctx { - ENTROPY_CONTEXT ta[MAX_MB_PLANE][16]; - ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; + ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MI_BLOCK_SIZE]; + ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MI_BLOCK_SIZE]; }; void vp10_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { @@ -96,9 +96,9 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, struct macroblock_plane *const p = &mb->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int ref = is_inter_block(&xd->mi[0]->mbmi); - vp10_token_state tokens[1025][2]; - unsigned best_index[1025][2]; - uint8_t token_cache[1024]; + vp10_token_state tokens[MAX_TX_SQUARE+1][2]; + unsigned best_index[MAX_TX_SQUARE+1][2]; + uint8_t token_cache[MAX_TX_SQUARE]; const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c index 97d091a82..77af3ddcd 100644 --- a/vp10/encoder/encoder.c +++ b/vp10/encoder/encoder.c @@ -1955,6 +1955,8 @@ void vp10_change_config(struct VP10_COMP *cpi, const VP10EncoderConfig *oxcf) { CHECK_MEM_ERROR(cm, x->palette_buffer, vpx_memalign(16, sizeof(*x->palette_buffer))); } + vp10_free_pc_tree(&cpi->td); + vp10_setup_pc_tree(&cpi->common, &cpi->td); } vp10_reset_segment_features(cm); @@ -3147,7 +3149,7 @@ static void loopfilter_frame(VP10_COMP *cpi, VP10_COMMON *cm) { } if (lf->filter_level > 0) { -#if CONFIG_VAR_TX +#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); #else if (cpi->num_workers > 1) diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h index 3126ca427..9e1b6fb7a 100644 --- a/vp10/encoder/encoder.h +++ b/vp10/encoder/encoder.h @@ -312,8 +312,8 @@ typedef struct VP10_COMP { QUANTS quants; ThreadData td; MB_MODE_INFO_EXT *mbmi_ext_base; - DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); // 8: SIMD width VP10_COMMON common; VP10EncoderConfig oxcf; struct lookahead_ctx *lookahead; diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c index 
23184ed92..4327d974c 100644 --- a/vp10/encoder/mcomp.c +++ b/vp10/encoder/mcomp.c @@ -366,13 +366,13 @@ static unsigned int setup_center_error(const MACROBLOCKD *xd, #if CONFIG_VP9_HIGHBITDEPTH if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); + DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } @@ -384,7 +384,7 @@ static unsigned int setup_center_error(const MACROBLOCKD *xd, #else (void) xd; if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { @@ -694,7 +694,7 @@ static int upsampled_pref_error(const MACROBLOCKD *xd, unsigned int besterr; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]); + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); if (second_pred != NULL) vpx_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y, y_stride); @@ -704,9 +704,9 @@ static int upsampled_pref_error(const MACROBLOCKD *xd, besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse); } else { - DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); #else - DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); (void) xd; #endif // CONFIG_VP9_HIGHBITDEPTH if (second_pred != NULL) @@ -1961,10 +1961,10 @@ unsigned int vp10_int_pro_motion_estimation(const VP10_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}}; - DECLARE_ALIGNED(16, int16_t, hbuf[128]); - DECLARE_ALIGNED(16, int16_t, vbuf[128]); - DECLARE_ALIGNED(16, int16_t, src_hbuf[64]); - DECLARE_ALIGNED(16, int16_t, src_vbuf[64]); + DECLARE_ALIGNED(16, int16_t, hbuf[2 * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, int16_t, vbuf[2 * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, int16_t, src_hbuf[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, int16_t, src_vbuf[MAX_SB_SQUARE]); int idx; const int bw = 4 << b_width_log2_lookup[bsize]; const int bh = 4 << b_height_log2_lookup[bsize]; diff --git a/vp10/encoder/picklpf.c b/vp10/encoder/picklpf.c index 56ff5c008..f491006cf 100644 --- a/vp10/encoder/picklpf.c +++ b/vp10/encoder/picklpf.c @@ -41,7 +41,7 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP10_COMMON *const cm = &cpi->common; int64_t filt_err; -#if CONFIG_VAR_TX +#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1, partial_frame); #else diff --git a/vp10/encoder/quantize.c b/vp10/encoder/quantize.c index f8a59ec7d..3f8f0f427 100644 --- a/vp10/encoder/quantize.c +++ b/vp10/encoder/quantize.c @@ -461,7 +461,7 @@ void vp10_init_quantizer(VP10_COMP *cpi) { cpi->uv_dequant[q][i] = quant; } - for (i = 2; i < 8; i++) { + for (i = 2; i < 8; i++) { // 8: SIMD width quants->y_quant[q][i] = quants->y_quant[q][1]; 
quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1]; quants->y_round_fp[q][i] = quants->y_round_fp[q][1]; diff --git a/vp10/encoder/quantize.h b/vp10/encoder/quantize.h index 9c0ab3fbf..612846055 100644 --- a/vp10/encoder/quantize.h +++ b/vp10/encoder/quantize.h @@ -27,6 +27,7 @@ typedef void (*VP10_QUANT_FACADE)(const tran_low_t *coeff_ptr, const scan_order *sc); typedef struct { + // 0: dc 1: ac 2-8: ac repeated to SIMD width DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c index c2f148dbf..203ac4213 100644 --- a/vp10/encoder/rd.c +++ b/vp10/encoder/rd.c @@ -62,7 +62,10 @@ void vp10_rd_cost_init(RD_COST *rd_cost) { // This table is used to correct for block size. // The factors here are << 2 (2 = x0.5, 32 = x8 etc). static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = { - 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32 + 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, +#if CONFIG_EXT_PARTITION + 48, 48, 64 +#endif // CONFIG_EXT_PARTITION }; static void fill_mode_costs(VP10_COMP *cpi) { @@ -560,8 +563,8 @@ void vp10_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[16], - ENTROPY_CONTEXT t_left[16]) { + ENTROPY_CONTEXT t_above[2 * MI_BLOCK_SIZE], + ENTROPY_CONTEXT t_left[2 * MI_BLOCK_SIZE]) { const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; @@ -935,7 +938,7 @@ void vp10_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, int mode; for (mode = 0; mode < top_mode; ++mode) { const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4); - const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_64X64); + const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_LARGEST); BLOCK_SIZE bs; for (bs = min_size; bs <= max_size; ++bs) { int *const fact = &factor_buf[bs][mode]; diff --git a/vp10/encoder/rd.h b/vp10/encoder/rd.h index 675b9db36..533e7751c 100644 --- a/vp10/encoder/rd.h +++ b/vp10/encoder/rd.h @@ -330,8 +330,8 @@ void vp10_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame); void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, - ENTROPY_CONTEXT t_above[16], - ENTROPY_CONTEXT t_left[16]); + ENTROPY_CONTEXT t_above[2 * MI_BLOCK_SIZE], + ENTROPY_CONTEXT t_left[2 * MI_BLOCK_SIZE]); void vp10_set_rd_speed_thresholds(struct VP10_COMP *cpi); diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c index f3056e9aa..328e70c75 100644 --- a/vp10/encoder/rdopt.c +++ b/vp10/encoder/rdopt.c @@ -102,8 +102,8 @@ typedef struct { struct rdcost_block_args { const VP10_COMP *cpi; MACROBLOCK *x; - ENTROPY_CONTEXT t_above[16]; - ENTROPY_CONTEXT t_left[16]; + ENTROPY_CONTEXT t_above[2 * MI_BLOCK_SIZE]; + ENTROPY_CONTEXT t_left[2 * MI_BLOCK_SIZE]; int this_rate; int64_t this_dist; int64_t this_sse; @@ -376,8 +376,8 @@ static void get_energy_distribution_fine(const VP10_COMP *cpi, unsigned int esq[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; unsigned int var[16]; double total = 0; - const int f_index = bsize - 6; + const int f_index = bsize - BLOCK_16X16; if (f_index < 0) { int i, j, index; int w_shift = bw == 8 ? 
1 : 2; @@ -890,7 +890,7 @@ static int cost_coeffs(MACROBLOCK *x, const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = x->token_costs[tx_size][type][is_inter_block(mbmi)]; - uint8_t token_cache[32 * 32]; + uint8_t token_cache[MAX_TX_SQUARE]; #if CONFIG_VAR_TX int pt = coeff_ctx; #else @@ -1045,10 +1045,10 @@ static void dist_block(const VP10_COMP *cpi, MACROBLOCK *x, int plane, if (*eob) { const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, recon16[32 * 32]); // MAX TX_SIZE**2 + DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); uint8_t *recon = (uint8_t*)recon16; #else - DECLARE_ALIGNED(16, uint8_t, recon[32 * 32]); // MAX TX_SIZE**2 + DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]); #endif // CONFIG_VP9_HIGHBITDEPTH const PLANE_TYPE plane_type = plane == 0 ? PLANE_TYPE_Y : PLANE_TYPE_UV; @@ -1064,18 +1064,18 @@ static void dist_block(const VP10_COMP *cpi, MACROBLOCK *x, int plane, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { recon = CONVERT_TO_BYTEPTR(recon); inv_txfm_param.bd = xd->bd; - vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, + vpx_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL, 0, bs, bs, xd->bd); - highbd_inv_txfm_add(dqcoeff, recon, 32, &inv_txfm_param); + highbd_inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param); } else #endif // CONFIG_VP9_HIGHBITDEPTH { - vpx_convolve_copy(dst, dst_stride, recon, 32, + vpx_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL, 0, bs, bs); - inv_txfm_add(dqcoeff, recon, 32, &inv_txfm_param); + inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param); } - cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, 32, &tmp); + cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, MAX_TX_SIZE, &tmp); } *out_dist = (int64_t)tmp * 16; @@ -2838,10 +2838,10 @@ void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, uint8_t *src = &p->src.buf[4 * blk_row * src_stride + 4 * blk_col]; uint8_t *dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col]; #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, rec_buffer_alloc_16[32 * 32]); + DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]); uint8_t *rec_buffer; #else - DECLARE_ALIGNED(16, uint8_t, rec_buffer[32 * 32]); + DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]); #endif // CONFIG_VP9_HIGHBITDEPTH const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; @@ -2860,16 +2860,16 @@ void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, // TODO(any): Use dist_block to compute distortion #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer_alloc_16); - vpx_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, 32, + rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16); + vpx_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL, 0, bh, bh, xd->bd); } else { - rec_buffer = (uint8_t *)rec_buffer_alloc_16; - vpx_convolve_copy(dst, pd->dst.stride, rec_buffer, 32, + rec_buffer = (uint8_t *)rec_buffer16; + vpx_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL, 0, bh, bh); } #else - vpx_convolve_copy(dst, pd->dst.stride, rec_buffer, 32, + vpx_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL, 0, bh, bh); #endif // CONFIG_VP9_HIGHBITDEPTH @@ 
-2904,12 +2904,12 @@ void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { inv_txfm_param.bd = xd->bd; - highbd_inv_txfm_add(dqcoeff, rec_buffer, 32, &inv_txfm_param); + highbd_inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param); } else { - inv_txfm_add(dqcoeff, rec_buffer, 32, &inv_txfm_param); + inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param); } #else // CONFIG_VP9_HIGHBITDEPTH - inv_txfm_add(dqcoeff, rec_buffer, 32, &inv_txfm_param); + inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param); #endif // CONFIG_VP9_HIGHBITDEPTH if ((bh >> 2) + blk_col > max_blocks_wide || @@ -2921,16 +2921,16 @@ void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, tmp = 0; for (idy = 0; idy < blocks_height; idy += 2) { for (idx = 0; idx < blocks_width; idx += 2) { - cpi->fn_ptr[BLOCK_8X8].vf(src + 4 * idy * src_stride + 4 * idx, - src_stride, - rec_buffer + 4 * idy * 32 + 4 * idx, - 32, &this_dist); + uint8_t *const s = src + 4 * idy * src_stride + 4 * idx; + uint8_t *const r = rec_buffer + 4 * idy * MAX_TX_SIZE + 4 * idx; + cpi->fn_ptr[BLOCK_8X8].vf(s, src_stride, r, MAX_TX_SIZE, &this_dist); tmp += this_dist; } } } else { uint32_t this_dist; - cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, 32, &this_dist); + cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, MAX_TX_SIZE, + &this_dist); tmp = this_dist; } } @@ -3125,8 +3125,10 @@ static void inter_block_yrd(const VP10_COMP *cpi, MACROBLOCK *x, int idx, idy; int block = 0; int step = 1 << (max_txsize_lookup[plane_bsize] * 2); - ENTROPY_CONTEXT ctxa[16], ctxl[16]; - TXFM_CONTEXT tx_above[8], tx_left[8]; + ENTROPY_CONTEXT ctxa[2 * MI_BLOCK_SIZE]; + ENTROPY_CONTEXT ctxl[2 * MI_BLOCK_SIZE]; + TXFM_CONTEXT tx_above[MI_BLOCK_SIZE]; + TXFM_CONTEXT tx_left[MI_BLOCK_SIZE]; int pnrate = 0, pnskip = 1; int64_t pndist = 0, pnsse = 0; @@ -3240,7 +3242,7 @@ static void select_tx_type_yrd(const VP10_COMP *cpi, MACROBLOCK *x, const int is_inter = is_inter_block(mbmi); TX_SIZE best_tx_size[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; TX_SIZE best_tx = TX_SIZES; - uint8_t best_blk_skip[256]; + uint8_t best_blk_skip[MI_BLOCK_SIZE * MI_BLOCK_SIZE * 4]; const int n4 = 1 << (num_pels_log2_lookup[bsize] - 4); int idx, idy; int prune = 0; @@ -3423,7 +3425,8 @@ static int inter_block_uvrd(const VP10_COMP *cpi, MACROBLOCK *x, int step = 1 << (max_txsize_lookup[plane_bsize] * 2); int pnrate = 0, pnskip = 1; int64_t pndist = 0, pnsse = 0; - ENTROPY_CONTEXT ta[16], tl[16]; + ENTROPY_CONTEXT ta[2 * MI_BLOCK_SIZE]; + ENTROPY_CONTEXT tl[2 * MI_BLOCK_SIZE]; vp10_get_entropy_contexts(bsize, TX_4X4, pd, ta, tl); @@ -4560,10 +4563,10 @@ static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x, // Prediction buffer from second frame. 
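// (Sizing note, assuming MAX_SB_SQUARE is MAX_SB_SIZE * MAX_SB_SIZE: these
// scratch buffers grow from 64 * 64 = 4096 to 128 * 128 = 16384 entries
// under --enable-ext-partition, i.e. 32 KiB for the uint16_t
// high-bit-depth variant below.)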
#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]); + DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); uint8_t *second_pred; #else - DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]); + DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]); #endif // CONFIG_VP9_HIGHBITDEPTH for (ref = 0; ref < 2; ++ref) { @@ -5733,9 +5736,9 @@ static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x, step_param = cpi->mv_step_param; } - if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) { + if (cpi->sf.adaptive_motion_search && bsize < BLOCK_LARGEST) { int boffset = - 2 * (b_width_log2_lookup[BLOCK_64X64] - + 2 * (b_width_log2_lookup[BLOCK_LARGEST] - VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize])); step_param = VPXMAX(step_param, boffset); } @@ -6202,16 +6205,15 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, const int * const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]]; const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); - const int tmp_buf_sz = CU_SIZE * CU_SIZE; #if CONFIG_REF_MV uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame); #endif #endif // CONFIG_EXT_INTER #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * CU_SIZE * CU_SIZE]); + DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * MAX_SB_SQUARE]); uint8_t *tmp_buf; #else - DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * CU_SIZE * CU_SIZE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * MAX_SB_SQUARE]); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_OBMC @@ -6226,7 +6228,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, int best_rate_y, best_rate_uv; #endif // CONFIG_SUPERTX #if CONFIG_VAR_TX - uint8_t best_blk_skip[3][256]; + uint8_t best_blk_skip[MAX_MB_PLANE][MI_BLOCK_SIZE * MI_BLOCK_SIZE * 4]; #endif // CONFIG_VAR_TX int64_t best_distortion = INT64_MAX; unsigned int best_pred_var = UINT_MAX; @@ -6241,8 +6243,8 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, int orig_dst_stride[MAX_MB_PLANE]; int rs = 0; INTERP_FILTER best_filter = SWITCHABLE; - uint8_t skip_txfm[MAX_MB_PLANE][4] = {{0}}; - int64_t bsse[MAX_MB_PLANE][4] = {{0}}; + uint8_t skip_txfm[MAX_MB_PLANE][MAX_TX_BLOCKS_IN_MAX_SB] = {{0}}; + int64_t bsse[MAX_MB_PLANE][MAX_TX_BLOCKS_IN_MAX_SB] = {{0}}; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; @@ -6569,8 +6571,8 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, restore_dst_buf(xd, orig_dst, orig_dst_stride); } else { for (j = 0; j < MAX_MB_PLANE; j++) { - xd->plane[j].dst.buf = tmp_buf + j * 64 * 64; - xd->plane[j].dst.stride = 64; + xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE; + xd->plane[j].dst.stride = MAX_SB_SIZE; } } vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); @@ -6648,15 +6650,15 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, if (have_newmv_in_inter_mode(this_mode)) { int_mv tmp_mv[2]; int rate_mvs[2], tmp_rate_mv = 0; - uint8_t pred0[2 * CU_SIZE * CU_SIZE * 3]; - uint8_t pred1[2 * CU_SIZE * CU_SIZE * 3]; + uint8_t pred0[2 * MAX_SB_SQUARE * 3]; + uint8_t pred1[2 * MAX_SB_SQUARE * 3]; uint8_t *preds0[3] = {pred0, - pred0 + 2 * CU_SIZE * CU_SIZE, - pred0 + 4 * CU_SIZE * CU_SIZE}; + pred0 + 2 * MAX_SB_SQUARE, + pred0 + 4 * MAX_SB_SQUARE}; uint8_t *preds1[3] = {pred1, - pred1 + 2 * CU_SIZE * CU_SIZE, - pred1 + 4 * CU_SIZE * CU_SIZE}; - int strides[3] = {CU_SIZE, CU_SIZE, CU_SIZE}; + pred1 + 2 * MAX_SB_SQUARE, + pred1 + 4 * 
MAX_SB_SQUARE}; + int strides[3] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE}; vp10_build_inter_predictors_for_planes_single_buf( xd, bsize, mi_row, mi_col, 0, preds0, strides); vp10_build_inter_predictors_for_planes_single_buf( @@ -6723,15 +6725,15 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, mbmi->mv[1].as_int = cur_mv[1].as_int; } } else { - uint8_t pred0[2 * CU_SIZE * CU_SIZE * 3]; - uint8_t pred1[2 * CU_SIZE * CU_SIZE * 3]; + uint8_t pred0[2 * MAX_SB_SQUARE * 3]; + uint8_t pred1[2 * MAX_SB_SQUARE * 3]; uint8_t *preds0[3] = {pred0, - pred0 + 2 * CU_SIZE * CU_SIZE, - pred0 + 4 * CU_SIZE * CU_SIZE}; + pred0 + 2 * MAX_SB_SQUARE, + pred0 + 4 * MAX_SB_SQUARE}; uint8_t *preds1[3] = {pred1, - pred1 + 2 * CU_SIZE * CU_SIZE, - pred1 + 4 * CU_SIZE * CU_SIZE}; - int strides[3] = {CU_SIZE, CU_SIZE, CU_SIZE}; + pred1 + 2 * MAX_SB_SQUARE, + pred1 + 4 * MAX_SB_SQUARE}; + int strides[3] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE}; vp10_build_inter_predictors_for_planes_single_buf( xd, bsize, mi_row, mi_col, 0, preds0, strides); vp10_build_inter_predictors_for_planes_single_buf( @@ -6791,8 +6793,8 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, int tmp_rate_mv = 0; mbmi->ref_frame[1] = NONE; for (j = 0; j < MAX_MB_PLANE; j++) { - xd->plane[j].dst.buf = tmp_buf + j * tmp_buf_sz; - xd->plane[j].dst.stride = CU_SIZE; + xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE; + xd->plane[j].dst.stride = MAX_SB_SIZE; } vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); restore_dst_buf(xd, orig_dst, orig_dst_stride); @@ -6805,11 +6807,11 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, rmode = intra_mode_cost[mbmi->interintra_mode]; vp10_build_interintra_predictors(xd, tmp_buf, - tmp_buf + tmp_buf_sz, - tmp_buf + 2 * tmp_buf_sz, - CU_SIZE, - CU_SIZE, - CU_SIZE, + tmp_buf + MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE, + MAX_SB_SIZE, + MAX_SB_SIZE, + MAX_SB_SIZE, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &skip_txfm_sb, &skip_sse_sb); @@ -6830,11 +6832,11 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, if (wedge_bits) { vp10_build_interintra_predictors(xd, tmp_buf, - tmp_buf + tmp_buf_sz, - tmp_buf + 2 * tmp_buf_sz, - CU_SIZE, - CU_SIZE, - CU_SIZE, + tmp_buf + MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE, + MAX_SB_SIZE, + MAX_SB_SIZE, + MAX_SB_SIZE, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &skip_txfm_sb, &skip_sse_sb); @@ -6852,11 +6854,11 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, mbmi->interintra_uv_wedge_index = wedge_index; vp10_build_interintra_predictors(xd, tmp_buf, - tmp_buf + tmp_buf_sz, - tmp_buf + 2 * tmp_buf_sz, - CU_SIZE, - CU_SIZE, - CU_SIZE, + tmp_buf + MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE, + MAX_SB_SIZE, + MAX_SB_SIZE, + MAX_SB_SIZE, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &skip_txfm_sb, &skip_sse_sb); @@ -6937,8 +6939,8 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x, if (best_needs_copy) { // again temporarily set the buffers to local memory to prevent a memcpy for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = tmp_buf + i * 64 * 64; - xd->plane[i].dst.stride = 64; + xd->plane[i].dst.buf = tmp_buf + i * MAX_SB_SQUARE; + xd->plane[i].dst.stride = MAX_SB_SIZE; } } rd = tmp_rd; @@ -7572,33 +7574,33 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi, const MODE_INFO *left_mi = xd->left_mi; #if CONFIG_OBMC #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * 
64 * 64]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * 64 * 64]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); #else - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * 64 * 64]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * 64 * 64]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]); #endif // CONFIG_VP9_HIGHBITDEPTH - uint8_t *dst_buf1[3], *dst_buf2[3]; - int dst_stride1[3] = {64, 64, 64}; - int dst_stride2[3] = {64, 64, 64}; + uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE}; + int dst_stride2[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE}; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + 4096 * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 8192 * len); + dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len); + dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_SB_SQUARE * len); dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + 4096 * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 8192 * len); + dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); + dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_SB_SQUARE * len); } else { #endif // CONFIG_VP9_HIGHBITDEPTH dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + 4096; - dst_buf1[2] = tmp_buf1 + 8192; + dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE; + dst_buf1[2] = tmp_buf1 + 2 * MAX_SB_SQUARE; dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + 4096; - dst_buf2[2] = tmp_buf2 + 8192; + dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE; + dst_buf2[2] = tmp_buf2 + 2 * MAX_SB_SQUARE; #if CONFIG_VP9_HIGHBITDEPTH } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -9386,7 +9388,7 @@ void vp10_rd_pick_inter_mode_sub8x8(struct VP10_COMP *cpi, int switchable_filter_index; int_mv *second_ref = comp_pred ? &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL; - b_mode_info tmp_best_bmodes[16]; + b_mode_info tmp_best_bmodes[16]; // Should this be 4 ? 
MB_MODE_INFO tmp_best_mbmode; BEST_SEG_INFO bsi[SWITCHABLE_FILTERS]; int pred_exists = 0; diff --git a/vp10/encoder/segmentation.c b/vp10/encoder/segmentation.c index e12282e70..f71946785 100644 --- a/vp10/encoder/segmentation.c +++ b/vp10/encoder/segmentation.c @@ -328,13 +328,13 @@ void vp10_choose_segmap_coding_method(VP10_COMMON *cm, MACROBLOCKD *xd) { mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride + tile_info.mi_col_start; for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; - mi_row += 8, mi_ptr += 8 * cm->mi_stride) { + mi_row += MI_BLOCK_SIZE, mi_ptr += MI_BLOCK_SIZE * cm->mi_stride) { MODE_INFO **mi = mi_ptr; for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += 8, mi += 8) { + mi_col += MI_BLOCK_SIZE, mi += MI_BLOCK_SIZE) { count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, - mi_row, mi_col, BLOCK_64X64); + mi_row, mi_col, BLOCK_LARGEST); } } } diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c index 25766bb45..cd1c91acd 100644 --- a/vp10/encoder/speed_features.c +++ b/vp10/encoder/speed_features.c @@ -353,6 +353,11 @@ static void set_rt_speed_feature(VP10_COMP *cpi, SPEED_FEATURES *sf, sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; +#if CONFIG_EXT_PARTITION + sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST; +#endif // CONFIG_EXT_PARTITION sf->max_intra_bsize = BLOCK_32X32; sf->allow_skip_recode = 1; } @@ -372,6 +377,11 @@ static void set_rt_speed_feature(VP10_COMP *cpi, SPEED_FEATURES *sf, sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO; sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO; sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO; +#if CONFIG_EXT_PARTITION + sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST_NEW_ZERO; +#endif // CONFIG_EXT_PARTITION sf->adaptive_rd_thresh = 2; // This feature is only enabled when partition search is disabled. sf->reuse_inter_pred_sby = 1; @@ -483,7 +493,7 @@ void vp10_set_speed_features_framesize_independent(VP10_COMP *cpi) { sf->use_square_partition_only = 0; sf->auto_min_max_partition_size = NOT_IN_USE; sf->rd_auto_partition_min_limit = BLOCK_4X4; - sf->default_max_partition_size = BLOCK_64X64; + sf->default_max_partition_size = BLOCK_LARGEST; sf->default_min_partition_size = BLOCK_4X4; sf->adjust_partitioning_from_last_frame = 0; sf->last_partitioning_redo_frequency = 4; @@ -514,7 +524,7 @@ void vp10_set_speed_features_framesize_independent(VP10_COMP *cpi) { sf->schedule_mode_search = 0; for (i = 0; i < BLOCK_SIZES; ++i) sf->inter_mode_mask[i] = INTER_ALL; - sf->max_intra_bsize = BLOCK_64X64; + sf->max_intra_bsize = BLOCK_LARGEST; sf->reuse_inter_pred_sby = 0; // This setting only takes effect when partition_search_type is set // to FIXED_PARTITION. @@ -541,6 +551,12 @@ void vp10_set_speed_features_framesize_independent(VP10_COMP *cpi) { else if (oxcf->mode == GOOD) set_good_speed_feature(cpi, cm, sf, oxcf->speed); + // sf->partition_search_breakout_dist_thr is set assuming max 64x64 + // blocks. Normalise this if the blocks are bigger. 
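// (For example, assuming MAX_SB_SIZE_LOG2 == 7 for 128x128 superblocks,
// the shift below scales the threshold by 1 << (2 * (7 - 6)) = 4, matching
// the 4x pixel count of a 128x128 block relative to a 64x64 one.)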
+ if (MAX_SB_SIZE_LOG2 > 6) { + sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6); + } + cpi->full_search_sad = vp10_full_search_sad; cpi->diamond_search_sad = vp10_diamond_search_sad; diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c index 2398a536e..a283b1059 100644 --- a/vp10/encoder/tokenize.c +++ b/vp10/encoder/tokenize.c @@ -438,7 +438,7 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col, MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; TOKENEXTRA **tp = args->tp; - uint8_t token_cache[32 * 32]; + uint8_t token_cache[MAX_TX_SQUARE]; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; diff --git a/vp10/encoder/x86/denoiser_sse2.c b/vp10/encoder/x86/denoiser_sse2.c index 047974ef8..5c1303a72 100644 --- a/vp10/encoder/x86/denoiser_sse2.c +++ b/vp10/encoder/x86/denoiser_sse2.c @@ -361,9 +361,7 @@ int vp10_denoiser_filter_sse2(const uint8_t *sig, int sig_stride, avg, avg_stride, increase_denoising, bs, motion_magnitude, 8); - } else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 || - bs == BLOCK_32X16|| bs == BLOCK_32X32 || bs == BLOCK_32X64 || - bs == BLOCK_64X32 || bs == BLOCK_64X64) { + } else if (bs < BLOCK_SIZES) { return vp10_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride, avg, avg_stride, diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c index 2e85ed481..59d048812 100644 --- a/vpx_dsp/vpx_convolve.c +++ b/vpx_dsp/vpx_convolve.c @@ -130,20 +130,20 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint8_t temp[MAX_EXT_SIZE * MAX_CU_SIZE]; + uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - assert(w <= MAX_CU_SIZE); - assert(h <= MAX_CU_SIZE); + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); assert(y_step_q4 <= 32); assert(x_step_q4 <= 32); convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - temp, MAX_CU_SIZE, + temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE, + convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h); } @@ -240,13 +240,13 @@ void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { /* Fixed size intermediate buffer places limits on parameters. */ - DECLARE_ALIGNED(16, uint8_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]); - assert(w <= MAX_CU_SIZE); - assert(h <= MAX_CU_SIZE); + DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); - vpx_convolve8_c(src, src_stride, temp, MAX_CU_SIZE, + vpx_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4, filter_y, y_step_q4, w, h); - vpx_convolve_avg_c(temp, MAX_CU_SIZE, dst, dst_stride, + vpx_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w, h); } @@ -463,21 +463,21 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. 
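// (With 128-pixel superblocks the same bound becomes
// ((128 - 1) * 32 + 15) >> 4 + 8, which rounds up to 263; MAX_EXT_SIZE is
// presumed to be enlarged accordingly elsewhere in this patch.)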
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint16_t temp[MAX_EXT_SIZE * MAX_CU_SIZE]; + uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - assert(w <= MAX_CU_SIZE); - assert(h <= MAX_CU_SIZE); + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); assert(y_step_q4 <= 32); assert(x_step_q4 <= 32); highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE, + CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, intermediate_height, bd); highbd_convolve_vert( - CONVERT_TO_BYTEPTR(temp) + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE, + CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); } @@ -561,14 +561,14 @@ void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { // Fixed size intermediate buffer places limits on parameters. - DECLARE_ALIGNED(16, uint16_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]); - assert(w <= MAX_CU_SIZE); - assert(h <= MAX_CU_SIZE); + DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); vpx_highbd_convolve8_c(src, src_stride, - CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE, + CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE, + vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w, h, bd); } diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h index 8d9bf558d..e12703176 100644 --- a/vpx_dsp/vpx_dsp_common.h +++ b/vpx_dsp/vpx_dsp_common.h @@ -13,18 +13,19 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#include "vpx_dsp/vpx_dsp_common.h" #include "vpx_ports/mem.h" #ifdef __cplusplus extern "C" { #endif -#if CONFIG_VP10 && CONFIG_EXT_PARTITION -# define MAX_CU_SIZE 128 -#else -# define MAX_CU_SIZE 64 -#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION +#ifndef MAX_SB_SIZE +# if CONFIG_VP10 && CONFIG_EXT_PARTITION +# define MAX_SB_SIZE 128 +# else +# define MAX_SB_SIZE 64 +# endif // CONFIG_VP10 && CONFIG_EXT_PARTITION +#endif // ndef MAX_SB_SIZE #define VPXMIN(x, y) (((x) < (y)) ? (x) : (y)) #define VPXMAX(x, y) (((x) > (y)) ? 
(x) : (y)) diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h index 95c721ab6..ab387d664 100644 --- a/vpx_dsp/x86/convolve.h +++ b/vpx_dsp/x86/convolve.h @@ -99,27 +99,27 @@ void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ int w, int h) { \ assert(filter_x[3] != 128); \ assert(filter_y[3] != 128); \ - assert(w <= MAX_CU_SIZE); \ - assert(h <= MAX_CU_SIZE); \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ assert(x_step_q4 == 16); \ assert(y_step_q4 == 16); \ if (filter_x[0] || filter_x[1] || filter_x[2]|| \ filter_y[0] || filter_y[1] || filter_y[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+7)]); \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+7)]); \ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - fdata2, MAX_CU_SIZE, \ + fdata2, MAX_SB_SIZE, \ filter_x, x_step_q4, filter_y, y_step_q4, \ w, h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_CU_SIZE, MAX_CU_SIZE, \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ dst, dst_stride, \ filter_x, x_step_q4, filter_y, \ y_step_q4, w, h); \ } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+1)]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_CU_SIZE, \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+1)]); \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \ filter_x, x_step_q4, filter_y, y_step_q4, \ w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, MAX_CU_SIZE, dst, dst_stride, \ + vpx_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \ filter_x, x_step_q4, filter_y, \ y_step_q4, w, h); \ } \ @@ -239,38 +239,38 @@ void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, \ int w, int h, int bd) { \ - assert(w <= MAX_CU_SIZE); \ - assert(h <= MAX_CU_SIZE); \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ if (x_step_q4 == 16 && y_step_q4 == 16) { \ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+7)]); \ + DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+7)]); \ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, \ src_stride, \ CONVERT_TO_BYTEPTR(fdata2), \ - MAX_CU_SIZE, \ + MAX_SB_SIZE, \ filter_x, x_step_q4, \ filter_y, y_step_q4, \ w, h + 7, bd); \ vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_CU_SIZE, \ - MAX_CU_SIZE, \ + CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, \ + MAX_SB_SIZE, \ dst, \ dst_stride, \ filter_x, x_step_q4, \ filter_y, y_step_q4, \ w, h, bd); \ } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+1)]); \ + DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+1)]); \ vpx_highbd_convolve8_horiz_##opt(src, \ src_stride, \ CONVERT_TO_BYTEPTR(fdata2), \ - MAX_CU_SIZE, \ + MAX_SB_SIZE, \ filter_x, x_step_q4, \ filter_y, y_step_q4, \ w, h + 1, bd); \ vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), \ - MAX_CU_SIZE, \ + MAX_SB_SIZE, \ dst, \ dst_stride, \ filter_x, x_step_q4, \ diff --git a/vpx_dsp/x86/subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm index 4273efb85..2225b7cf6 100644 --- a/vpx_dsp/x86/subtract_sse2.asm +++ b/vpx_dsp/x86/subtract_sse2.asm @@ -31,6 +31,10 @@ cglobal subtract_block, 7, 7, 8, \ je .case_16 
cmp colsd, 32 je .case_32 +%if CONFIG_EXT_PARTITION + cmp colsd, 64 + je .case_64 +%endif %macro loop16 6 mova m0, [srcq+%1] @@ -55,6 +59,22 @@ cglobal subtract_block, 7, 7, 8, \ mova [diffq+mmsize*1+%6], m1 %endmacro +%if CONFIG_EXT_PARTITION + mov pred_str, pred_stridemp +.loop_128: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize + loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + sub rowsd, 1 + jnz .loop_128 + RET + +.case_64: +%endif mov pred_str, pred_stridemp .loop_64: loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 6c5991858..48a88aeb4 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -844,12 +844,12 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // --Require an additional 8 rows for the horiz_w8 transpose tail. - DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_CU_SIZE]); + DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]); const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - assert(w <= MAX_CU_SIZE); - assert(h <= MAX_CU_SIZE); + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); assert(y_step_q4 <= 32); assert(x_step_q4 <= 32); @@ -857,33 +857,33 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, - MAX_CU_SIZE, + MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, intermediate_height); } else { scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, - MAX_CU_SIZE, + MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, intermediate_height); } if (w >= 16) { - scaledconvolve_vert_w16(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_CU_SIZE, + scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h); } else if (w == 8) { - scaledconvolve_vert_w8(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_CU_SIZE, + scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h); } else { - scaledconvolve_vert_w4(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_CU_SIZE, + scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h); diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h index 38dd2706b..04467d0b2 100644 --- a/vpx_scale/yv12config.h +++ b/vpx_scale/yv12config.h @@ -21,7 +21,11 @@ extern "C" { #include "vpx/vpx_integer.h" #define VP8BORDERINPIXELS 32 -#define VP9INNERBORDERINPIXELS 96 +#if CONFIG_EXT_PARTITION +# define VP9INNERBORDERINPIXELS 160 +#else +# define VP9INNERBORDERINPIXELS 96 +#endif // CONFIG_EXT_PARTITION #define VP9_INTERP_EXTEND 4 #define VP9_ENC_BORDER_IN_PIXELS 160 #define VP9_DEC_BORDER_IN_PIXELS 160 -- 2.40.0
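For reference, a sketch of how the size constants used throughout this patch presumably relate (the real definitions live in vp10/common/enums.h and vpx_dsp/vpx_dsp_common.h; treat the exact forms here as assumptions rather than the literal diff):

/* Presumed relationships between the size constants (sketch only). */
#define MAX_SB_SIZE_LOG2 7                        /* 6 when --enable-ext-partition is off */
#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)       /* 128, previously 64 */
#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
#define MI_BLOCK_SIZE (MAX_SB_SIZE >> 3)          /* superblock width in 8x8 mode-info units */
#define MAX_TX_SIZE 32                            /* the largest transform stays 32x32 */
#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE) /* 1024; cf. tokens[MAX_TX_SQUARE + 1] */
#define MAX_TX_BLOCKS_IN_MAX_SB \
  ((MAX_SB_SIZE / MAX_TX_SIZE) * (MAX_SB_SIZE / MAX_TX_SIZE)) /* 16, previously 4 */
/* BLOCK_LARGEST names the largest square BLOCK_SIZE:
   BLOCK_128X128 with ext-partition, BLOCK_64X64 without. */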