From: Ronald S. Bultje
Date: Mon, 20 Aug 2012 21:43:34 +0000 (-0700)
Subject: Superblock coding.
X-Git-Tag: v1.3.0~1217^2~292^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5d4cffb35f4bc23462eedc95a4802c65e32d7d5a;p=libvpx

Superblock coding.

This commit adds a pick_sb_mode() function which selects the best 32x32
superblock coding mode. It then selects the best per-MB modes, compares the
two, and encodes the winner in the bitstream.

The bitstream coding is rather simplistic right now. At the SB level, we code
a bit to indicate whether this block uses SB-coding (32x32 prediction) or
MB-coding (anything else), and then we follow with the actual modes. This
could and should be refined in the future, but that is omitted from this
commit because it would involve reorganizing much more code than just adding
SB coding, so it is better judged on its own merits.

Gains on derf: about even; YT/HD: +0.75%; STD/HD: +1.5%.

Change-Id: Iae313a7cbd8f75b3c66d04a68b991cb096eaaba6
---
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 1926f20bd..1cba5d35a 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -148,6 +148,7 @@ typedef enum {
 #define VP8_YMODES (B_PRED + 1)
 #define VP8_UV_MODES (TM_PRED + 1)
 #define VP8_I8X8_MODES (TM_PRED + 1)
+#define VP8_I32X32_MODES (TM_PRED + 1)
 #define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
@@ -293,6 +294,11 @@ typedef struct {
   INTERPOLATIONFILTERTYPE interp_filter;
 #endif
+#if CONFIG_SUPERBLOCKS
+  // FIXME need a SB array of 4 MB_MODE_INFOs that
+  // only needs one encoded_as_sb.
+  unsigned char encoded_as_sb;
+#endif
 } MB_MODE_INFO;
 typedef struct {
diff --git a/vp8/common/entropymode.c b/vp8/common/entropymode.c
index 8d43ce827..5627aa43a 100644
--- a/vp8/common/entropymode.c
+++ b/vp8/common/entropymode.c
@@ -227,6 +227,14 @@ const vp8_tree_index vp8_mv_ref_tree[8] = {
   -NEWMV, -SPLITMV
 };
+#if CONFIG_SUPERBLOCKS
+const vp8_tree_index vp8_sb_mv_ref_tree[6] = {
+  -ZEROMV, 2,
+  -NEARESTMV, 4,
+  -NEARMV, -NEWMV
+};
+#endif
+
 const vp8_tree_index vp8_sub_mv_ref_tree[6] = {
   -LEFT4X4, 2,
   -ABOVE4X4, 4,
@@ -236,12 +244,18 @@ const vp8_tree_index vp8_sub_mv_ref_tree[6] = {
 struct vp8_token_struct vp8_bmode_encodings [VP8_BINTRAMODES];
 struct vp8_token_struct vp8_ymode_encodings [VP8_YMODES];
+#if CONFIG_SUPERBLOCKS
+struct vp8_token_struct vp8_sb_kf_ymode_encodings [VP8_I32X32_MODES];
+#endif
 struct vp8_token_struct vp8_kf_ymode_encodings [VP8_YMODES];
 struct vp8_token_struct vp8_uv_mode_encodings [VP8_UV_MODES];
-struct vp8_token_struct vp8_i8x8_mode_encodings [VP8_UV_MODES];
+struct vp8_token_struct vp8_i8x8_mode_encodings [VP8_I8X8_MODES];
 struct vp8_token_struct vp8_mbsplit_encodings [VP8_NUMMBSPLITS];
 struct vp8_token_struct vp8_mv_ref_encoding_array [VP8_MVREFS];
+#if CONFIG_SUPERBLOCKS
+struct vp8_token_struct vp8_sb_mv_ref_encoding_array [VP8_MVREFS];
+#endif
 struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS];
@@ -253,11 +267,18 @@ void vp8_init_mbmode_probs(VP8_COMMON *x) {
     vp8_ymode_tree, x->fc.ymode_prob, bct, y_mode_cts, 256, 1);
   {
     int i;
-    for (i = 0; i < 8; i++)
+    for (i = 0; i < 8; i++) {
       vp8_tree_probs_from_distribution(
         VP8_YMODES, vp8_kf_ymode_encodings, vp8_kf_ymode_tree,
         x->kf_ymode_prob[i], bct, kf_y_mode_cts[i],
         256, 1);
+#if CONFIG_SUPERBLOCKS
+      vp8_tree_probs_from_distribution(
+        VP8_I32X32_MODES, vp8_sb_kf_ymode_encodings, vp8_sb_ymode_tree,
+        x->sb_kf_ymode_prob[i], bct, kf_y_mode_cts[i],
+        256, 1);
+#endif
+    }
   }
   {
     int i;
@@ -360,6 +381,9 @@ void vp8_entropy_mode_init() {
vp8_tokens_from_tree(vp8_bmode_encodings, vp8_bmode_tree); vp8_tokens_from_tree(vp8_ymode_encodings, vp8_ymode_tree); vp8_tokens_from_tree(vp8_kf_ymode_encodings, vp8_kf_ymode_tree); +#if CONFIG_SUPERBLOCKS + vp8_tokens_from_tree(vp8_sb_kf_ymode_encodings, vp8_sb_ymode_tree); +#endif vp8_tokens_from_tree(vp8_uv_mode_encodings, vp8_uv_mode_tree); vp8_tokens_from_tree(vp8_i8x8_mode_encodings, vp8_i8x8_mode_tree); vp8_tokens_from_tree(vp8_mbsplit_encodings, vp8_mbsplit_tree); @@ -370,6 +394,10 @@ void vp8_entropy_mode_init() { vp8_tokens_from_tree_offset(vp8_mv_ref_encoding_array, vp8_mv_ref_tree, NEARESTMV); +#if CONFIG_SUPERBLOCKS + vp8_tokens_from_tree_offset(vp8_sb_mv_ref_encoding_array, + vp8_sb_mv_ref_tree, NEARESTMV); +#endif vp8_tokens_from_tree_offset(vp8_sub_mv_ref_encoding_array, vp8_sub_mv_ref_tree, LEFT4X4); } diff --git a/vp8/common/entropymode.h b/vp8/common/entropymode.h index f9cc263b9..430c949a6 100644 --- a/vp8/common/entropymode.h +++ b/vp8/common/entropymode.h @@ -40,21 +40,25 @@ extern const vp8_tree_index vp8_bmode_tree[]; extern const vp8_tree_index vp8_ymode_tree[]; extern const vp8_tree_index vp8_kf_ymode_tree[]; extern const vp8_tree_index vp8_uv_mode_tree[]; +#define vp8_sb_ymode_tree vp8_uv_mode_tree extern const vp8_tree_index vp8_i8x8_mode_tree[]; extern const vp8_tree_index vp8_mbsplit_tree[]; extern const vp8_tree_index vp8_mv_ref_tree[]; +extern const vp8_tree_index vp8_sb_mv_ref_tree[]; extern const vp8_tree_index vp8_sub_mv_ref_tree[]; extern struct vp8_token_struct vp8_bmode_encodings [VP8_BINTRAMODES]; extern struct vp8_token_struct vp8_ymode_encodings [VP8_YMODES]; +extern struct vp8_token_struct vp8_sb_kf_ymode_encodings [VP8_I32X32_MODES]; extern struct vp8_token_struct vp8_kf_ymode_encodings [VP8_YMODES]; -extern struct vp8_token_struct vp8_i8x8_mode_encodings [VP8_UV_MODES]; +extern struct vp8_token_struct vp8_i8x8_mode_encodings [VP8_I8X8_MODES]; extern struct vp8_token_struct vp8_uv_mode_encodings [VP8_UV_MODES]; extern struct vp8_token_struct vp8_mbsplit_encodings [VP8_NUMMBSPLITS]; /* Inter mode values do not start at zero */ extern struct vp8_token_struct vp8_mv_ref_encoding_array [VP8_MVREFS]; +extern struct vp8_token_struct vp8_sb_mv_ref_encoding_array [VP8_MVREFS]; extern struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS]; void vp8_entropy_mode_init(void); diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index b71ef750d..d28024cda 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -47,6 +47,12 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) { rtcd->recon.recon4 = vp8_recon4b_c; rtcd->recon.recon_mb = vp8_recon_mb_c; rtcd->recon.recon_mby = vp8_recon_mby_c; +#if CONFIG_SUPERBLOCKS + rtcd->recon.build_intra_predictors_sby_s = + vp8_build_intra_predictors_sby_s; + rtcd->recon.build_intra_predictors_sbuv_s = + vp8_build_intra_predictors_sbuv_s; +#endif rtcd->recon.build_intra_predictors_mby = vp8_build_intra_predictors_mby; #if CONFIG_COMP_INTRA_PRED diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c index 05c00ef4e..d9c4b54be 100644 --- a/vp8/common/loopfilter.c +++ b/vp8/common/loopfilter.c @@ -325,7 +325,13 @@ void vp8_loop_filter_frame lfi.lim = lfi_n->lim[filter_level]; lfi.hev_thr = lfi_n->hev_thr[hev_index]; - if (mb_col > 0) + if (mb_col > 0 +#if CONFIG_SUPERBLOCKS + && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb && + mode_info_context[0].mbmi.mb_skip_coeff && + mode_info_context[-1].mbmi.mb_skip_coeff) 
+#endif + ) vp8_loop_filter_mbv_c (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); @@ -344,7 +350,13 @@ void vp8_loop_filter_frame } /* don't apply across umv border */ - if (mb_row > 0) + if (mb_row > 0 +#if CONFIG_SUPERBLOCKS + && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb && + mode_info_context[0].mbmi.mb_skip_coeff && + mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff) +#endif + ) vp8_loop_filter_mbh_c (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); @@ -362,7 +374,13 @@ void vp8_loop_filter_frame } } else { // FIXME: Not 8x8 aware - if (mb_col > 0) + if (mb_col > 0 +#if CONFIG_SUPERBLOCKS + && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb && + mode_info_context[0].mbmi.mb_skip_coeff && + mode_info_context[-1].mbmi.mb_skip_coeff) +#endif + ) LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v) (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); @@ -371,7 +389,13 @@ void vp8_loop_filter_frame (y_ptr, post->y_stride, lfi_n->blim[filter_level]); /* don't apply across umv border */ - if (mb_row > 0) + if (mb_row > 0 +#if CONFIG_SUPERBLOCKS + && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb && + mode_info_context[0].mbmi.mb_skip_coeff && + mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff) +#endif + ) LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h) (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index a36347dca..b7a543220 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -226,12 +226,15 @@ typedef struct VP8Common { /* Y,U,V,Y2 */ ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */ - ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */ + ENTROPY_CONTEXT_PLANES left_context[2]; /* (up to) 4 contexts "" */ /* keyframe block modes are predicted by their above, left neighbors */ vp8_prob kf_bmode_prob [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES - 1]; vp8_prob kf_ymode_prob[8][VP8_YMODES - 1]; /* keyframe "" */ +#if CONFIG_SUPERBLOCKS + vp8_prob sb_kf_ymode_prob[8][VP8_I32X32_MODES - 1]; +#endif int kf_ymode_probs_index; int kf_ymode_probs_update; vp8_prob kf_uv_mode_prob[VP8_YMODES] [VP8_UV_MODES - 1]; @@ -239,6 +242,9 @@ typedef struct VP8Common { vp8_prob prob_intra_coded; vp8_prob prob_last_coded; vp8_prob prob_gf_coded; +#if CONFIG_SUPERBLOCKS + vp8_prob sb_coded; +#endif // Context probabilities when using predictive coding of segment id vp8_prob segment_pred_probs[PREDICTION_PROBS]; diff --git a/vp8/common/pred_common.c b/vp8/common/pred_common.c index ac5d86009..cb80a0f7e 100644 --- a/vp8/common/pred_common.c +++ b/vp8/common/pred_common.c @@ -1,3 +1,4 @@ + /* * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
* @@ -224,10 +225,24 @@ void set_pred_flag(MACROBLOCKD *const xd, switch (pred_id) { case PRED_SEG_ID: xd->mode_info_context->mbmi.seg_id_predicted = pred_flag; +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag; + xd->mode_info_context[xd->mode_info_stride].mbmi.seg_id_predicted = pred_flag; + xd->mode_info_context[xd->mode_info_stride+1].mbmi.seg_id_predicted = pred_flag; + } +#endif break; case PRED_REF: xd->mode_info_context->mbmi.ref_predicted = pred_flag; +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + xd->mode_info_context[1].mbmi.ref_predicted = pred_flag; + xd->mode_info_context[xd->mode_info_stride].mbmi.ref_predicted = pred_flag; + xd->mode_info_context[xd->mode_info_stride+1].mbmi.ref_predicted = pred_flag; + } +#endif break; case PRED_MBSKIP: diff --git a/vp8/common/recon.c b/vp8/common/recon.c index 8fc320863..cf2d2fb85 100644 --- a/vp8/common/recon.c +++ b/vp8/common/recon.c @@ -124,6 +124,52 @@ void vp8_recon2b_c } } +#if CONFIG_SUPERBLOCKS +void vp8_recon_mby_s_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd, uint8_t *dst) { + int x, y; + BLOCKD *b = &xd->block[0]; + int stride = b->dst_stride; + short *diff = b->diff; + + for (y = 0; y < 16; y++) { + for (x = 0; x < 16; x++) { + int a = dst[x] + diff[x]; + if (a < 0) + a = 0; + else if (a > 255) + a = 255; + dst[x] = a; + } + dst += stride; + diff += 16; + } +} + +void vp8_recon_mbuv_s_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) { + int x, y, i; + uint8_t *dst = udst; + + for (i = 0; i < 2; i++, dst = vdst) { + BLOCKD *b = &xd->block[16 + 4 * i]; + int stride = b->dst_stride; + short *diff = b->diff; + + for (y = 0; y < 8; y++) { + for (x = 0; x < 8; x++) { + int a = dst[x] + diff[x]; + if (a < 0) + a = 0; + else if (a > 255) + a = 255; + dst[x] = a; + } + dst += stride; + diff += 8; + } + } +} +#endif + void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) { #if ARCH_ARM BLOCKD *b = &xd->block[0]; diff --git a/vp8/common/recon.h b/vp8/common/recon.h index 2626a218d..3527fc14d 100644 --- a/vp8/common/recon.h +++ b/vp8/common/recon.h @@ -100,6 +100,11 @@ extern prototype_recon_macroblock(vp8_recon_recon_mb); #endif extern prototype_recon_macroblock(vp8_recon_recon_mby); +#ifndef vp8_recon_build_intra_predictors_sby_s +#define vp8_recon_build_intra_predictors_sby_s vp8_build_intra_predictors_sby_s +#endif +extern prototype_build_intra_predictors(vp8_recon_build_intra_predictors_sby_s); + #ifndef vp8_recon_build_intra_predictors_mby #define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby #endif @@ -126,6 +131,11 @@ extern prototype_build_intra_predictors\ extern prototype_build_intra_predictors\ (vp8_recon_build_intra_predictors_mby_s); +#ifndef vp8_recon_build_intra_predictors_sbuv_s +#define vp8_recon_build_intra_predictors_sbuv_s vp8_build_intra_predictors_sbuv_s +#endif +extern prototype_build_intra_predictors(vp8_recon_build_intra_predictors_sbuv_s); + #ifndef vp8_recon_build_intra_predictors_mbuv #define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv #endif @@ -214,10 +224,16 @@ typedef struct vp8_recon_rtcd_vtable { vp8_recon_fn_t recon4; vp8_recon_mb_fn_t recon_mb; vp8_recon_mb_fn_t recon_mby; +#if CONFIG_SUPERBLOCKS + vp8_build_intra_pred_fn_t build_intra_predictors_sby_s; +#endif vp8_build_intra_pred_fn_t build_intra_predictors_mby_s; vp8_build_intra_pred_fn_t build_intra_predictors_mby; #if 
CONFIG_COMP_INTRA_PRED vp8_build_intra_pred_fn_t build_comp_intra_predictors_mby; +#endif +#if CONFIG_SUPERBLOCKS + vp8_build_intra_pred_fn_t build_intra_predictors_sbuv_s; #endif vp8_build_intra_pred_fn_t build_intra_predictors_mbuv_s; vp8_build_intra_pred_fn_t build_intra_predictors_mbuv; diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c index 0d82db784..1b5ef837f 100644 --- a/vp8/common/reconinter.c +++ b/vp8/common/reconinter.c @@ -759,6 +759,56 @@ void vp8_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd, vp8_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride); } +#if CONFIG_SUPERBLOCKS +void vp8_build_inter32x32_predictors_sb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride) { + uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer; + uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer, + *v2 = x->second_pre.v_buffer; + int n; + + for (n = 0; n < 4; n++) + { + const int x_idx = n & 1, y_idx = n >> 1; + + x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride + x_idx * 16; + x->pre.u_buffer = u1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8; + x->pre.v_buffer = v1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8; + + vp8_build_1st_inter16x16_predictors_mb(x, + dst_y + y_idx * 16 * dst_ystride + x_idx * 16, + dst_u + y_idx * 8 * dst_uvstride + x_idx * 8, + dst_v + y_idx * 8 * dst_uvstride + x_idx * 8, + dst_ystride, dst_uvstride); + if (x->mode_info_context->mbmi.second_ref_frame) { + x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride + x_idx * 16; + x->second_pre.u_buffer = u2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8; + x->second_pre.v_buffer = v2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8; + + vp8_build_2nd_inter16x16_predictors_mb(x, + dst_y + y_idx * 16 * dst_ystride + x_idx * 16, + dst_u + y_idx * 8 * dst_uvstride + x_idx * 8, + dst_v + y_idx * 8 * dst_uvstride + x_idx * 8, + dst_ystride, dst_uvstride); + } + } + + x->pre.y_buffer = y1; + x->pre.u_buffer = u1; + x->pre.v_buffer = v1; + + if (x->mode_info_context->mbmi.second_ref_frame) { + x->second_pre.y_buffer = y2; + x->second_pre.u_buffer = u2; + x->second_pre.v_buffer = v2; + } +} +#endif + /* * The following functions should be called after an initial * call to vp8_build_inter16x16_predictors_mb() or _mby()/_mbuv(). 
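For orientation, the vp8_build_inter32x32_predictors_sb() hunk above builds the
32x32 inter prediction by reusing the existing 16x16 macroblock predictors: it
visits the four MBs of the superblock, offsets the reference and destination
pointers for each quadrant, and restores the saved base pointers afterwards.
A minimal illustrative sketch of that quadrant/offset arithmetic follows; the
function and parameter names are hypothetical, not part of the patch.

/* Illustrative sketch only (not part of the patch): how a 32x32 superblock
 * prediction decomposes into four 16x16 macroblock predictions.  Quadrant n
 * (0..3) maps to an MB column/row pair and to pointer offsets into the luma
 * and chroma planes, mirroring the x_idx/y_idx arithmetic in
 * vp8_build_inter32x32_predictors_sb() above. */
static void sb_quadrant_offsets(int n, int y_stride, int uv_stride,
                                int *y_offset, int *uv_offset) {
  const int x_idx = n & 1;   /* 0 = left 16x16 column, 1 = right  */
  const int y_idx = n >> 1;  /* 0 = top 16x16 row,     1 = bottom */

  *y_offset  = y_idx * 16 * y_stride  + x_idx * 16;  /* luma:   16x16 step */
  *uv_offset = y_idx *  8 * uv_stride + x_idx *  8;  /* chroma:  8x8 step  */
}

Each quadrant then goes through the normal
vp8_build_1st_inter16x16_predictors_mb() call (plus the second-reference pass
when a compound reference is in use), so no new motion-compensation kernels
are needed for superblocks.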
diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c index 96bebc5be..d858cd153 100644 --- a/vp8/common/reconintra.c +++ b/vp8/common/reconintra.c @@ -207,17 +207,18 @@ void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, } } -void vp8_build_intra_predictors_mby_internal(MACROBLOCKD *xd, - unsigned char *ypred_ptr, - int y_stride, int mode) { +void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, + unsigned char *src, int src_stride, + unsigned char *ypred_ptr, + int y_stride, int mode, int bsize) { - unsigned char *yabove_row = xd->dst.y_buffer - xd->dst.y_stride; - unsigned char yleft_col[16]; + unsigned char *yabove_row = src - src_stride; + unsigned char yleft_col[32]; unsigned char ytop_left = yabove_row[-1]; int r, c, i; - for (i = 0; i < 16; i++) { - yleft_col[i] = xd->dst.y_buffer [i * xd->dst.y_stride - 1]; + for (i = 0; i < bsize; i++) { + yleft_col[i] = xd->dst.y_buffer [i * src_stride - 1]; } /* for Y */ @@ -227,58 +228,58 @@ void vp8_build_intra_predictors_mby_internal(MACROBLOCKD *xd, int i; int shift; int average = 0; - + int log2_bsize_minus_1; + + assert(bsize == 8 || bsize == 16 || bsize == 32); + if (bsize == 8) { + log2_bsize_minus_1 = 2; + } else if (bsize == 16) { + log2_bsize_minus_1 = 3; + } else /* bsize == 32 */ { + log2_bsize_minus_1 = 4; + } if (xd->up_available || xd->left_available) { if (xd->up_available) { - for (i = 0; i < 16; i++) { + for (i = 0; i < bsize; i++) { average += yabove_row[i]; } } if (xd->left_available) { - for (i = 0; i < 16; i++) { + for (i = 0; i < bsize; i++) { average += yleft_col[i]; } } - shift = 3 + xd->up_available + xd->left_available; + shift = log2_bsize_minus_1 + xd->up_available + xd->left_available; expected_dc = (average + (1 << (shift - 1))) >> shift; } else { expected_dc = 128; } - for (r = 0; r < 16; r++) { - vpx_memset(ypred_ptr, expected_dc, 16); - ypred_ptr += y_stride; /*16;*/ + for (r = 0; r < bsize; r++) { + vpx_memset(ypred_ptr, expected_dc, bsize); + ypred_ptr += y_stride; } } break; case V_PRED: { - - for (r = 0; r < 16; r++) { - - ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0]; - ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1]; - ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2]; - ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3]; + for (r = 0; r < bsize; r++) { + memcpy(ypred_ptr, yabove_row, bsize); ypred_ptr += y_stride; } } break; case H_PRED: { - - for (r = 0; r < 16; r++) { - - vpx_memset(ypred_ptr, yleft_col[r], 16); + for (r = 0; r < bsize; r++) { + vpx_memset(ypred_ptr, yleft_col[r], bsize); ypred_ptr += y_stride; } - } break; case TM_PRED: { - - for (r = 0; r < 16; r++) { - for (c = 0; c < 16; c++) { + for (r = 0; r < bsize; r++) { + for (c = 0; c < bsize; c++) { int pred = yleft_col[r] + yabove_row[ c] - ytop_left; if (pred < 0) @@ -292,31 +293,30 @@ void vp8_build_intra_predictors_mby_internal(MACROBLOCKD *xd, ypred_ptr += y_stride; } - } break; case D45_PRED: { - d45_predictor(ypred_ptr, y_stride, 16, yabove_row, yleft_col); + d45_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); } break; case D135_PRED: { - d135_predictor(ypred_ptr, y_stride, 16, yabove_row, yleft_col); + d135_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); } break; case D117_PRED: { - d117_predictor(ypred_ptr, y_stride, 16, yabove_row, yleft_col); + d117_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); } break; case D153_PRED: { - d153_predictor(ypred_ptr, y_stride, 16, yabove_row, yleft_col); + d153_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); } break; case 
D27_PRED: { - d27_predictor(ypred_ptr, y_stride, 16, yabove_row, yleft_col); + d27_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); } break; case D63_PRED: { - d63_predictor(ypred_ptr, y_stride, 16, yabove_row, yleft_col); + d63_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); } break; case I8X8_PRED: @@ -332,25 +332,36 @@ void vp8_build_intra_predictors_mby_internal(MACROBLOCKD *xd, } void vp8_build_intra_predictors_mby(MACROBLOCKD *xd) { - vp8_build_intra_predictors_mby_internal(xd, xd->predictor, 16, - xd->mode_info_context->mbmi.mode); + vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + xd->predictor, 16, + xd->mode_info_context->mbmi.mode, 16); } void vp8_build_intra_predictors_mby_s(MACROBLOCKD *xd) { - vp8_build_intra_predictors_mby_internal(xd, xd->dst.y_buffer, - xd->dst.y_stride, - xd->mode_info_context->mbmi.mode); + vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + xd->dst.y_buffer, xd->dst.y_stride, + xd->mode_info_context->mbmi.mode, 16); } +#if CONFIG_SUPERBLOCKS +void vp8_build_intra_predictors_sby_s(MACROBLOCKD *x) { + vp8_build_intra_predictors_internal(x, x->dst.y_buffer, x->dst.y_stride, + x->dst.y_buffer, x->dst.y_stride, + x->mode_info_context->mbmi.mode, 32); +} +#endif + #if CONFIG_COMP_INTRA_PRED void vp8_build_comp_intra_predictors_mby(MACROBLOCKD *xd) { unsigned char predictor[2][256]; int i; - vp8_build_intra_predictors_mby_internal( - xd, predictor[0], 16, xd->mode_info_context->mbmi.mode); - vp8_build_intra_predictors_mby_internal( - xd, predictor[1], 16, xd->mode_info_context->mbmi.second_mode); + vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + predictor[0], 16, + xd->mode_info_context->mbmi.mode); + vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + predictor[1], 16, + xd->mode_info_context->mbmi.second_mode); for (i = 0; i < 256; i++) { xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1; @@ -362,172 +373,37 @@ void vp8_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd, unsigned char *upred_ptr, unsigned char *vpred_ptr, int uv_stride, - int mode) { - YV12_BUFFER_CONFIG * dst = &xd->dst; - unsigned char *uabove_row = dst->u_buffer - dst->uv_stride; - unsigned char uleft_col[16]; - unsigned char utop_left = uabove_row[-1]; - unsigned char *vabove_row = dst->v_buffer - dst->uv_stride; - unsigned char vleft_col[20]; - unsigned char vtop_left = vabove_row[-1]; - - int i, j; - - for (i = 0; i < 8; i++) { - uleft_col[i] = dst->u_buffer [i * dst->uv_stride - 1]; - vleft_col[i] = dst->v_buffer [i * dst->uv_stride - 1]; - } - - switch (mode) { - case DC_PRED: { - int expected_udc; - int expected_vdc; - int i; - int shift; - int Uaverage = 0; - int Vaverage = 0; - - if (xd->up_available) { - for (i = 0; i < 8; i++) { - Uaverage += uabove_row[i]; - Vaverage += vabove_row[i]; - } - } - - if (xd->left_available) { - for (i = 0; i < 8; i++) { - Uaverage += uleft_col[i]; - Vaverage += vleft_col[i]; - } - } - - if (!xd->up_available && !xd->left_available) { - expected_udc = 128; - expected_vdc = 128; - } else { - shift = 2 + xd->up_available + xd->left_available; - expected_udc = (Uaverage + (1 << (shift - 1))) >> shift; - expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift; - } - - - /*vpx_memset(upred_ptr,expected_udc,64);*/ - /*vpx_memset(vpred_ptr,expected_vdc,64);*/ - for (i = 0; i < 8; i++) { - vpx_memset(upred_ptr, expected_udc, 8); - vpx_memset(vpred_ptr, expected_vdc, 8); - upred_ptr += uv_stride; /*8;*/ - 
vpred_ptr += uv_stride; /*8;*/ - } - } - break; - case V_PRED: { - int i; - - for (i = 0; i < 8; i++) { - vpx_memcpy(upred_ptr, uabove_row, 8); - vpx_memcpy(vpred_ptr, vabove_row, 8); - upred_ptr += uv_stride; /*8;*/ - vpred_ptr += uv_stride; /*8;*/ - } - - } - break; - case H_PRED: { - int i; - - for (i = 0; i < 8; i++) { - vpx_memset(upred_ptr, uleft_col[i], 8); - vpx_memset(vpred_ptr, vleft_col[i], 8); - upred_ptr += uv_stride; /*8;*/ - vpred_ptr += uv_stride; /*8;*/ - } - } - - break; - case TM_PRED: { - int i; - - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - int predu = uleft_col[i] + uabove_row[j] - utop_left; - int predv = vleft_col[i] + vabove_row[j] - vtop_left; - - if (predu < 0) - predu = 0; - - if (predu > 255) - predu = 255; - - if (predv < 0) - predv = 0; - - if (predv > 255) - predv = 255; - - upred_ptr[j] = predu; - vpred_ptr[j] = predv; - } - - upred_ptr += uv_stride; /*8;*/ - vpred_ptr += uv_stride; /*8;*/ - } - - } - break; - case D45_PRED: { - d45_predictor(upred_ptr, uv_stride, 8, uabove_row, uleft_col); - d45_predictor(vpred_ptr, uv_stride, 8, vabove_row, vleft_col); - } - break; - case D135_PRED: { - d135_predictor(upred_ptr, uv_stride, 8, uabove_row, uleft_col); - d135_predictor(vpred_ptr, uv_stride, 8, vabove_row, vleft_col); - } - break; - case D117_PRED: { - d117_predictor(upred_ptr, uv_stride, 8, uabove_row, uleft_col); - d117_predictor(vpred_ptr, uv_stride, 8, vabove_row, vleft_col); - } - break; - case D153_PRED: { - d153_predictor(upred_ptr, uv_stride, 8, uabove_row, uleft_col); - d153_predictor(vpred_ptr, uv_stride, 8, vabove_row, vleft_col); - } - break; - case D27_PRED: { - d27_predictor(upred_ptr, uv_stride, 8, uabove_row, uleft_col); - d27_predictor(vpred_ptr, uv_stride, 8, vabove_row, vleft_col); - } - break; - case D63_PRED: { - d63_predictor(upred_ptr, uv_stride, 8, uabove_row, uleft_col); - d63_predictor(vpred_ptr, uv_stride, 8, vabove_row, vleft_col); - } - break; - case B_PRED: - case NEARESTMV: - case NEARMV: - case ZEROMV: - case NEWMV: - case SPLITMV: - case MB_MODE_COUNT: - break; - } + int mode, int bsize) { + vp8_build_intra_predictors_internal(xd, xd->dst.u_buffer, xd->dst.uv_stride, + upred_ptr, uv_stride, mode, bsize); + vp8_build_intra_predictors_internal(xd, xd->dst.v_buffer, xd->dst.uv_stride, + vpred_ptr, uv_stride, mode, bsize); } void vp8_build_intra_predictors_mbuv(MACROBLOCKD *xd) { - vp8_build_intra_predictors_mbuv_internal( - xd, &xd->predictor[256], &xd->predictor[320], - 8, xd->mode_info_context->mbmi.uv_mode); + vp8_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256], + &xd->predictor[320], 8, + xd->mode_info_context->mbmi.uv_mode, + 8); } void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) { - vp8_build_intra_predictors_mbuv_internal( - xd, xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.uv_stride, xd->mode_info_context->mbmi.uv_mode); + vp8_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.uv_stride, + xd->mode_info_context->mbmi.uv_mode, + 8); } +#if CONFIG_SUPERBLOCKS +void vp8_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) { + vp8_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.uv_stride, + xd->mode_info_context->mbmi.uv_mode, + 16); +} +#endif + #if CONFIG_COMP_INTRA_PRED void vp8_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) { unsigned char predictor[2][2][64]; @@ -541,7 +417,8 @@ void vp8_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) { xd->mode_info_context->mbmi.second_uv_mode); for (i = 0; i < 64; i++) { 
xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1; - xd->predictor[256 + 64 + i] = (predictor[1][0][i] + predictor[1][1][i] + 1) >> 1; + xd->predictor[256 + 64 + i] = (predictor[1][0][i] + + predictor[1][1][i] + 1) >> 1; } } #endif diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c index 94826ef6c..5e0600c2d 100644 --- a/vp8/decoder/decodemv.c +++ b/vp8/decoder/decodemv.c @@ -29,34 +29,31 @@ int dec_mvcount = 0; #endif static int vp8_read_bmode(vp8_reader *bc, const vp8_prob *p) { - const int i = vp8_treed_read(bc, vp8_bmode_tree, p); - - return i; + return vp8_treed_read(bc, vp8_bmode_tree, p); } static int vp8_read_ymode(vp8_reader *bc, const vp8_prob *p) { - const int i = vp8_treed_read(bc, vp8_ymode_tree, p); + return vp8_treed_read(bc, vp8_ymode_tree, p); +} - return i; +#if CONFIG_SUPERBLOCKS +static int vp8_sb_kfread_ymode(vp8_reader *bc, const vp8_prob *p) { + return vp8_treed_read(bc, vp8_uv_mode_tree, p); } +#endif static int vp8_kfread_ymode(vp8_reader *bc, const vp8_prob *p) { - const int i = vp8_treed_read(bc, vp8_kf_ymode_tree, p); - - return i; + return vp8_treed_read(bc, vp8_kf_ymode_tree, p); } -static int vp8_read_i8x8_mode(vp8_reader *bc, const vp8_prob *p) { - const int i = vp8_treed_read(bc, vp8_i8x8_mode_tree, p); - return i; +static int vp8_read_i8x8_mode(vp8_reader *bc, const vp8_prob *p) { + return vp8_treed_read(bc, vp8_i8x8_mode_tree, p); } static int vp8_read_uv_mode(vp8_reader *bc, const vp8_prob *p) { - const int i = vp8_treed_read(bc, vp8_uv_mode_tree, p); - - return i; + return vp8_treed_read(bc, vp8_uv_mode_tree, p); } // This function reads the current macro block's segnent id from the bitstream @@ -112,8 +109,14 @@ static void vp8_kfread_modes(VP8D_COMP *pbi, m->mbmi.mb_skip_coeff = 0; } +#if CONFIG_SUPERBLOCKS + if (m->mbmi.encoded_as_sb) { + y_mode = (MB_PREDICTION_MODE) vp8_sb_kfread_ymode(bc, + pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]); + } else +#endif y_mode = (MB_PREDICTION_MODE) vp8_kfread_ymode(bc, - pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]); + pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]); #if CONFIG_COMP_INTRA_PRED m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1); #endif @@ -398,16 +401,18 @@ static MV_REFERENCE_FRAME read_ref_frame(VP8D_COMP *pbi, return (MV_REFERENCE_FRAME)ref_frame; } -static MB_PREDICTION_MODE read_mv_ref(vp8_reader *bc, const vp8_prob *p) { - const int i = vp8_treed_read(bc, vp8_mv_ref_tree, p); +#if CONFIG_SUPERBLOCKS +static MB_PREDICTION_MODE read_sb_mv_ref(vp8_reader *bc, const vp8_prob *p) { + return (MB_PREDICTION_MODE) vp8_treed_read(bc, vp8_sb_mv_ref_tree, p); +} +#endif - return (MB_PREDICTION_MODE)i; +static MB_PREDICTION_MODE read_mv_ref(vp8_reader *bc, const vp8_prob *p) { + return (MB_PREDICTION_MODE) vp8_treed_read(bc, vp8_mv_ref_tree, p); } static B_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p) { - const int i = vp8_treed_read(bc, vp8_sub_mv_ref_tree, p); - - return (B_PREDICTION_MODE)i; + return (B_PREDICTION_MODE) vp8_treed_read(bc, vp8_sub_mv_ref_tree, p); } #ifdef VPX_MODE_COUNT @@ -537,15 +542,36 @@ static void read_mb_segment_id(VP8D_COMP *pbi, // Else .... 
decode it explicitly else { vp8_read_mb_segid(bc, mbmi, xd); - cm->last_frame_seg_map[index] = mbmi->segment_id; } - } // Normal unpredicted coding mode else { vp8_read_mb_segid(bc, mbmi, xd); + } +#if CONFIG_SUPERBLOCKS + if (mbmi->encoded_as_sb) { + cm->last_frame_seg_map[index] = + cm->last_frame_seg_map[index + 1] = + cm->last_frame_seg_map[index + cm->mb_cols] = + cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id; + } else +#endif + { cm->last_frame_seg_map[index] = mbmi->segment_id; } + } else { +#if CONFIG_SUPERBLOCKS + if (mbmi->encoded_as_sb) { + mbmi->segment_id = + cm->last_frame_seg_map[index] && + cm->last_frame_seg_map[index + 1] && + cm->last_frame_seg_map[index + cm->mb_cols] && + cm->last_frame_seg_map[index + cm->mb_cols + 1]; + } else +#endif + { + mbmi->segment_id = cm->last_frame_seg_map[index]; + } } } else { // The encoder explicitly sets the segment_id to 0 @@ -667,6 +693,11 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mbmi->mode = get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE); } else { +#if CONFIG_SUPERBLOCKS + if (mbmi->encoded_as_sb) { + mbmi->mode = read_sb_mv_ref(bc, mv_ref_p); + } else +#endif mbmi->mode = read_mv_ref(bc, mv_ref_p); vp8_accum_mv_refs(&pbi->common, mbmi->mode, rct); @@ -963,6 +994,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mbmi->mode = (MB_PREDICTION_MODE) get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE); else { + // FIXME write using SB mode tree mbmi->mode = (MB_PREDICTION_MODE) vp8_read_ymode(bc, pbi->common.fc.ymode_prob); pbi->common.fc.ymode_counts[mbmi->mode]++; @@ -1045,6 +1077,9 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi) { int mb_row = (sb_row << 1); for (sb_col = 0; sb_col < sb_cols; sb_col++) { +#if CONFIG_SUPERBLOCKS + mi->mbmi.encoded_as_sb = vp8_read(&pbi->bc, cm->sb_coded); +#endif for (i = 0; i < 4; i++) { int dy = row_delta[i]; @@ -1059,6 +1094,10 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi) { prev_mi += offset_extended; continue; } +#if CONFIG_SUPERBLOCKS + if (i) + mi->mbmi.encoded_as_sb = 0; +#endif // Make sure the MacroBlockD mode info pointer is set correctly xd->mode_info_context = mi; @@ -1074,6 +1113,18 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi) { read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col); +#if CONFIG_SUPERBLOCKS + if (mi->mbmi.encoded_as_sb) { + assert(!i); + mb_col += 2; + mi[1] = mi[cm->mode_info_stride] = + mi[cm->mode_info_stride + 1] = mi[0]; + mi += 2; + prev_mi += 2; + break; + } +#endif + /* next macroblock */ mb_row += dy; mb_col += dx; diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index 6ff914893..61d3c8d2c 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -175,10 +175,27 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) { */ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) { if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_sbuv_s)(xd); + RECON_INVOKE(&pbi->common.rtcd.recon, + build_intra_predictors_sby_s)(xd); + } else { +#endif RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd); RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mby_s)(xd); +#if CONFIG_SUPERBLOCKS + } +#endif } else { +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, 
xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); + } else { +#endif vp8_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.y_stride, xd->dst.uv_stride); @@ -188,6 +205,9 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) { xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.y_stride, xd->dst.uv_stride); } +#if CONFIG_SUPERBLOCKS + } +#endif } #ifdef DEC_DEBUG if (dec_debug) { @@ -204,11 +224,15 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) { extern const int vp8_i8x8_block[4]; static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, - unsigned int mb_idx) { + unsigned int mb_col) { int eobtotal = 0; MB_PREDICTION_MODE mode; int i; int tx_type; +#if CONFIG_SUPERBLOCKS + VP8_COMMON *pc = &pbi->common; + int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff; +#endif #if CONFIG_HYBRIDTRANSFORM int QIndex = xd->q_index; @@ -264,11 +288,25 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->mode_info_context->mbmi.txfm_size = TX_8X8; } #endif +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + xd->mode_info_context->mbmi.txfm_size = TX_8X8; + } +#endif tx_type = xd->mode_info_context->mbmi.txfm_size; if (xd->mode_info_context->mbmi.mb_skip_coeff) { vp8_reset_mb_tokens_context(xd); +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + xd->above_context++; + xd->left_context++; + vp8_reset_mb_tokens_context(xd); + xd->above_context--; + xd->left_context--; + } +#endif } else if (!vp8dx_bool_error(xd->current_bc)) { for (i = 0; i < 25; i++) { xd->block[i].eob = 0; @@ -311,8 +349,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, * */ xd->mode_info_context->mbmi.mb_skip_coeff = 1; - skip_recon_mb(pbi, xd); - return; +#if CONFIG_SUPERBLOCKS + if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag) +#endif + { + skip_recon_mb(pbi, xd); + return; + } } #ifdef DEC_DEBUG @@ -343,6 +386,12 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, /* do prediction */ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_sby_s)(xd); + RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_sbuv_s)(xd); + } else +#endif if (mode != I8X8_PRED) { RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv)(xd); if (mode != B_PRED) { @@ -358,6 +407,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, #endif } } else { +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); + } else +#endif vp8_build_inter_predictors_mb(xd); } @@ -481,6 +537,32 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, else #endif if (tx_type == TX_8X8) { +#if CONFIG_SUPERBLOCKS + void *orig = xd->mode_info_context; + int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 
4 : 1; + for (n = 0; n < num; n++) { + if (n != 0) { + for (i = 0; i < 25; i++) { + xd->block[i].eob = 0; + xd->eobs[i] = 0; + } + xd->above_context = pc->above_context + mb_col + (n & 1); + xd->left_context = pc->left_context + (n >> 1); + xd->mode_info_context = orig; + xd->mode_info_context += (n & 1); + xd->mode_info_context += (n >> 1) * pc->mode_info_stride; + if (!orig_skip_flag) { + eobtotal = vp8_decode_mb_tokens_8x8(pbi, xd); + if (eobtotal == 0) // skip loopfilter + xd->mode_info_context->mbmi.mb_skip_coeff = 1; + } else { + vp8_reset_mb_tokens_context(xd); + } + } + + if (xd->mode_info_context->mbmi.mb_skip_coeff) + continue; // only happens for SBs, which are already in dest buffer +#endif DEQUANT_INVOKE(&pbi->dequant, block_2x2)(b); #ifdef DEC_DEBUG if (dec_debug) { @@ -501,10 +583,27 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, ((int *)b->qcoeff)[5] = 0; ((int *)b->qcoeff)[6] = 0; ((int *)b->qcoeff)[7] = 0; - DEQUANT_INVOKE(&pbi->dequant, dc_idct_add_y_block_8x8) - (xd->qcoeff, xd->block[0].dequant, - xd->predictor, xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd); +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + vp8_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff, + xd->block[0].dequant, + xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16, + xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd); + // do UV inline also + vp8_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16, + xd->block[16].dequant, + xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8, + xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8, + xd->dst.uv_stride, xd->eobs + 16, xd); + } else +#endif + DEQUANT_INVOKE(&pbi->dequant, dc_idct_add_y_block_8x8)(xd->qcoeff, + xd->block[0].dequant, xd->predictor, xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd); +#if CONFIG_SUPERBLOCKS + } + xd->mode_info_context = orig; +#endif } else { DEQUANT_INVOKE(&pbi->dequant, block)(b); if (xd->eobs[24] > 1) { @@ -529,7 +628,10 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, } } - if (tx_type == TX_8X8 +#if CONFIG_SUPERBLOCKS + if (!xd->mode_info_context->mbmi.encoded_as_sb) { +#endif + if (tx_type == TX_8X8 #if CONFIG_TX16X16 || tx_type == TX_16X16 #endif @@ -543,6 +645,9 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, (xd->qcoeff + 16 * 16, xd->block[16].dequant, xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.uv_stride, xd->eobs + 16); +#if CONFIG_SUPERBLOCKS + } +#endif } @@ -582,15 +687,21 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { int row_delta[4] = { 0, +1, 0, -1}; int col_delta[4] = { +1, -1, +1, +1}; int sb_cols = (pc->mb_cols + 1) >> 1; - ENTROPY_CONTEXT_PLANES left_context[2]; // For a SB there are 2 left contexts, each pertaining to a MB row within - vpx_memset(left_context, 0, sizeof(left_context)); + vpx_memset(pc->left_context, 0, sizeof(pc->left_context)); mb_row = mbrow; mb_col = 0; for (sb_col = 0; sb_col < sb_cols; sb_col++) { + MODE_INFO *mi = xd->mode_info_context; + +#if CONFIG_SUPERBLOCKS + if (pbi->interleaved_decoding) + mi->mbmi.encoded_as_sb = vp8_read(&pbi->bc, pc->sb_coded); +#endif + // Process the 4 MBs within the SB in the order: // top-left, top-right, bottom-left, bottom-right for (i = 0; i < 4; i++) { @@ -598,6 +709,7 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { int dx = col_delta[i]; int offset_extended = dy * 
xd->mode_info_stride + dx; + mi = xd->mode_info_context; if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) { // MB lies outside frame, skip on to next mb_row += dy; @@ -610,13 +722,10 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { #ifdef DEC_DEBUG dec_debug = (pc->current_video_frame == 0 && mb_row == 0 && mb_col == 0); #endif - // Copy in the appropriate left context for this MB row - vpx_memcpy(&pc->left_context, - &left_context[i >> 1], - sizeof(ENTROPY_CONTEXT_PLANES)); // Set above context pointer xd->above_context = pc->above_context + mb_col; + xd->left_context = pc->left_context + (i >> 1); /* Distance of Mb to the various image edges. * These are specified to 8th pel as they are always compared to @@ -639,6 +748,10 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset; xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset; +#if CONFIG_SUPERBLOCKS + if (i) + mi->mbmi.encoded_as_sb = 0; +#endif if(pbi->interleaved_decoding) vpx_decode_mb_mode_mv(pbi, xd, mb_row, mb_col); @@ -681,15 +794,34 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) { xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted; } - decode_macroblock(pbi, xd, mb_row * pc->mb_cols + mb_col); +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + mi[1] = mi[0]; + mi[pc->mode_info_stride] = mi[0]; + mi[pc->mode_info_stride + 1] = mi[0]; + } +#endif + decode_macroblock(pbi, xd, mb_col); +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + mi[1].mbmi.txfm_size = mi[0].mbmi.txfm_size; + mi[pc->mode_info_stride].mbmi.txfm_size = mi[0].mbmi.txfm_size; + mi[pc->mode_info_stride + 1].mbmi.txfm_size = mi[0].mbmi.txfm_size; + } +#endif /* check if the boolean decoder has suffered an error */ xd->corrupted |= vp8dx_bool_error(xd->current_bc); - // Store the modified left context for the MB row locally - vpx_memcpy(&left_context[i >> 1], - &pc->left_context, - sizeof(ENTROPY_CONTEXT_PLANES)); +#if CONFIG_SUPERBLOCKS + if (mi->mbmi.encoded_as_sb) { + assert(!i); + mb_col += 2; + xd->mode_info_context += 2; + xd->prev_mode_info_context += 2; + break; + } +#endif // skip to next MB xd->mode_info_context += offset_extended; @@ -806,7 +938,6 @@ static void init_frame(VP8D_COMP *pbi) { vp8_setup_interp_filters(xd, pc->mcomp_filter_type, pc); } - xd->left_context = &pc->left_context; xd->mode_info_context = pc->mi; xd->prev_mode_info_context = pc->prev_mi; xd->frame_type = pc->frame_type; @@ -1151,6 +1282,10 @@ int vp8_decode_frame(VP8D_COMP *pbi) { } } +#if CONFIG_SUPERBLOCKS + pc->sb_coded = vp8_read_literal(bc, 8); +#endif + /* Read the loop filter level and type */ pc->txfm_mode = (TXFM_MODE) vp8_read_bit(bc); diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c index 36eea5d6f..e97d3298f 100644 --- a/vp8/decoder/idct_blk.c +++ b/vp8/decoder/idct_blk.c @@ -127,6 +127,19 @@ void vp8_dequant_dc_idct_add_y_block_8x8_c } +#if CONFIG_SUPERBLOCKS +void vp8_dequant_dc_idct_add_y_block_8x8_inplace_c +(short *q, short *dq, + unsigned char *dst, int stride, char *eobs, short *dc, MACROBLOCKD *xd) { + + vp8_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]); + vp8_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8, dst + 8, stride, stride, dc[1]); + vp8_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride, dst + 8 * stride, stride, stride, dc[4]); + vp8_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 
8, dst + 8 * stride + 8, stride, stride, dc[8]); + +} +#endif + void vp8_dequant_idct_add_y_block_8x8_c (short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, MACROBLOCKD *xd) { @@ -153,6 +166,18 @@ void vp8_dequant_idct_add_uv_block_8x8_c vp8_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride); } +#if CONFIG_SUPERBLOCKS +void vp8_dequant_idct_add_uv_block_8x8_inplace_c +(short *q, short *dq, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs, MACROBLOCKD *xd) { + vp8_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride); + + q += 64; + + vp8_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride); +} +#endif + #if CONFIG_LOSSLESS void vp8_dequant_dc_idct_add_y_block_lossless_c (short *q, short *dq, unsigned char *pre, diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 63499a8f7..2e1364817 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -149,7 +149,7 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) { pbi->decoded_key_frame = 0; - pbi->interleaved_decoding = CONFIG_NEWBESTREFMV; + pbi->interleaved_decoding = CONFIG_NEWBESTREFMV || CONFIG_SUPERBLOCKS; return (VP8D_PTR) pbi; } diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 76aed7e2d..90bc8e987 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -288,6 +288,12 @@ static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p) { vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m); } +#if CONFIG_SUPERBLOCKS +static void sb_kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p) { + vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_sb_kf_ymode_encodings + m); +} +#endif + static void write_i8x8_mode(vp8_writer *bc, int m, const vp8_prob *p) { vp8_write_token(bc, vp8_i8x8_mode_tree, p, vp8_i8x8_mode_encodings + m); } @@ -533,6 +539,16 @@ static void write_mv_ref vp8_mv_ref_encoding_array - NEARESTMV + m); } +#if CONFIG_SUPERBLOCKS +static void write_sb_mv_ref(vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p) { +#if CONFIG_DEBUG + assert(NEARESTMV <= m && m < SPLITMV); +#endif + vp8_write_token(w, vp8_sb_mv_ref_tree, p, + vp8_sb_mv_ref_encoding_array - NEARESTMV + m); +} +#endif + static void write_sub_mv_ref ( vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p @@ -810,6 +826,9 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { // Process the 4 MBs in the order: // top-left, top-right, bottom-left, bottom-right +#if CONFIG_SUPERBLOCKS + vp8_write(w, m->mbmi.encoded_as_sb, pc->sb_coded); +#endif for (i = 0; i < 4; i++) { MB_MODE_INFO *mi; MV_REFERENCE_FRAME rf; @@ -872,7 +891,15 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { if (pc->mb_no_coeff_skip && (!segfeature_active(xd, segment_id, SEG_LVL_EOB) || (get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) { - vp8_encode_bool(w, mi->mb_skip_coeff, + int skip_coeff = mi->mb_skip_coeff; +#if CONFIG_SUPERBLOCKS + if (mi->encoded_as_sb) { + skip_coeff &= m[1].mbmi.mb_skip_coeff; + skip_coeff &= m[mis].mbmi.mb_skip_coeff; + skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff; + } +#endif + vp8_encode_bool(w, skip_coeff, get_pred_prob(pc, xd, PRED_MBSKIP)); } @@ -884,6 +911,8 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { active_section = 6; #endif + // TODO(rbultje) write using SB tree structure + if (!segfeature_active(xd, segment_id, SEG_LVL_MODE)) { write_ymode(w, mode, pc->fc.ymode_prob); } @@ -949,7 +978,14 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { // Is the segment coding of mode enabled if 
(!segfeature_active(xd, segment_id, SEG_LVL_MODE)) { - write_mv_ref(w, mode, mv_ref_p); +#if CONFIG_SUPERBLOCKS + if (mi->encoded_as_sb) { + write_sb_mv_ref(w, mode, mv_ref_p); + } else +#endif + { + write_mv_ref(w, mode, mv_ref_p); + } vp8_accum_mv_refs(&cpi->common, mode, ct); } @@ -1085,6 +1121,17 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { } } +#if CONFIG_SUPERBLOCKS + if (m->mbmi.encoded_as_sb) { + assert(!i); + mb_col += 2; + m += 2; + cpi->mb.partition_info += 2; + prev_m += 2; + break; + } +#endif + // Next MB mb_row += dy; mb_col += dx; @@ -1151,6 +1198,9 @@ static void write_kfmodes(VP8_COMP *cpi) { mb_col = 0; for (col = 0; col < c->mb_cols; col += 2) { +#if CONFIG_SUPERBLOCKS + vp8_write(bc, m->mbmi.encoded_as_sb, c->sb_coded); +#endif // Process the 4 MBs in the order: // top-left, top-right, bottom-left, bottom-right for (i = 0; i < 4; i++) { @@ -1181,11 +1231,27 @@ static void write_kfmodes(VP8_COMP *cpi) { if (c->mb_no_coeff_skip && (!segfeature_active(xd, segment_id, SEG_LVL_EOB) || (get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) { - vp8_encode_bool(bc, m->mbmi.mb_skip_coeff, + int skip_coeff = m->mbmi.mb_skip_coeff; +#if CONFIG_SUPERBLOCKS + if (m->mbmi.encoded_as_sb) { + skip_coeff &= m[1].mbmi.mb_skip_coeff; + skip_coeff &= m[mis].mbmi.mb_skip_coeff; + skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff; + } +#endif + vp8_encode_bool(bc, skip_coeff, get_pred_prob(c, xd, PRED_MBSKIP)); } - kfwrite_ymode(bc, ym, - c->kf_ymode_prob[c->kf_ymode_probs_index]); +#if CONFIG_SUPERBLOCKS + if (m->mbmi.encoded_as_sb) { + sb_kfwrite_ymode(bc, ym, + c->sb_kf_ymode_prob[c->kf_ymode_probs_index]); + } else +#endif + { + kfwrite_ymode(bc, ym, + c->kf_ymode_prob[c->kf_ymode_probs_index]); + } if (ym == B_PRED) { const int mis = c->mode_info_stride; @@ -1233,6 +1299,14 @@ static void write_kfmodes(VP8_COMP *cpi) { } else write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]); +#if CONFIG_SUPERBLOCKS + if (m->mbmi.encoded_as_sb) { + assert(!i); + mb_col += 2; + m += 2; + break; + } +#endif // Next MB mb_row += dy; mb_col += dx; @@ -1793,7 +1867,7 @@ static void put_delta_q(vp8_writer *bc, int delta_q) { } else vp8_write_bit(bc, 0); } -extern const unsigned int kf_y_mode_cts[8][VP8_YMODES]; + static void decide_kf_ymode_entropy(VP8_COMP *cpi) { int mode_cost[MB_MODE_COUNT]; @@ -1808,6 +1882,13 @@ static void decide_kf_ymode_entropy(VP8_COMP *cpi) { for (j = 0; j < VP8_YMODES; j++) { cost += mode_cost[j] * cpi->ymode_count[j]; } +#if CONFIG_SUPERBLOCKS + vp8_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i], + vp8_sb_ymode_tree); + for (j = 0; j < VP8_I32X32_MODES; j++) { + cost += mode_cost[j] * cpi->sb_ymode_count[j]; + } +#endif if (cost < bestcost) { bestindex = i; bestcost = cost; @@ -1906,11 +1987,6 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) // Select the coding strategy (temporal or spatial) choose_segmap_coding_method(cpi); - // Take a copy of the segment map if it changed for - // future comparison - vpx_memcpy(pc->last_frame_seg_map, - cpi->segmentation_map, pc->MBs); - // Write out the chosen coding method. vp8_write_bit(bc, (pc->temporal_update) ? 
1 : 0); } @@ -2048,6 +2124,19 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size) } } +#if CONFIG_SUPERBLOCKS + { + /* sb mode probability */ + int sb_coded = 256 - (cpi->sb_count << 8) / (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1)); + if (sb_coded <= 0) + sb_coded = 1; + else if (sb_coded >= 256) + sb_coded = 255; + pc->sb_coded = sb_coded; + vp8_write_literal(bc, pc->sb_coded, 8); + } +#endif + vp8_write_bit(bc, pc->txfm_mode); // Encode the loop filter level and type diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h index dfc1d743e..d73af4faa 100644 --- a/vp8/encoder/block.h +++ b/vp8/encoder/block.h @@ -82,7 +82,9 @@ typedef struct { int best_mode_index; int rddiv; int rdmult; - + int hybrid_pred_diff; + int comp_pred_diff; + int single_pred_diff; } PICK_MODE_CONTEXT; typedef struct { @@ -139,12 +141,6 @@ typedef struct { int mv_col_max; int mv_row_min; int mv_row_max; -#if CONFIG_SUPERBLOCKS - int mv_col_min_sb; - int mv_col_max_sb; - int mv_row_min_sb; - int mv_row_max_sb; -#endif int skip; @@ -163,8 +159,6 @@ typedef struct { int optimize; int q_index; - int encode_as_sb; - // Structure to hold context for each of the 4 MBs within a SB: // when encoded as 4 independent MBs: PICK_MODE_CONTEXT mb_context[4]; diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index e58c852a7..4472497e0 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -57,16 +57,24 @@ extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MB_ROW_COMP *mbr_ei, int mb_row, int count); -extern int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, +int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, + int recon_yoffset, int recon_uvoffset, + int *returnrate, int *returndistortion); +extern void vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, - int recon_uvoffset); + int recon_uvoffset, int *r, int *d); void vp8_build_block_offsets(MACROBLOCK *x); void vp8_setup_block_ptrs(MACROBLOCK *x); void vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset, int output_enabled); +void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, + int recon_yoffset, int recon_uvoffset, int mb_col, int mb_row); void vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int output_enabled); +void vp8cx_encode_intra_super_block(VP8_COMP *cpi, + MACROBLOCK *x, + TOKENEXTRA **t, int mb_col); static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x); @@ -378,6 +386,13 @@ static void update_state(VP8_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { // Restore the coding context of the MB to that that was in place // when the mode was picked for it vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO)); +#if CONFIG_SUPERBLOCKS + if (mi->mbmi.encoded_as_sb) { + vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO)); + vpx_memcpy(xd->mode_info_context + cpi->common.mode_info_stride, mi, sizeof(MODE_INFO)); + vpx_memcpy(xd->mode_info_context + cpi->common.mode_info_stride + 1, mi, sizeof(MODE_INFO)); + } +#endif if (mb_mode == B_PRED) { for (i = 0; i < 16; i++) { @@ -448,6 +463,10 @@ static void update_state(VP8_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { cpi->prediction_error += ctx->distortion; cpi->intra_error += ctx->intra_error; + + cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff; + cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff; + cpi->rd_comp_pred_diff[2] += 
ctx->hybrid_pred_diff; } } @@ -458,7 +477,8 @@ static void pick_mb_modes(VP8_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, TOKENEXTRA **tp, - int *totalrate) { + int *totalrate, + int *totaldist) { int i; int map_index; int recon_yoffset, recon_uvoffset; @@ -477,7 +497,7 @@ static void pick_mb_modes(VP8_COMP *cpi, /* Function should not modify L & A contexts; save and restore on exit */ vpx_memcpy(left_context, - cpi->left_context, + cm->left_context, sizeof(left_context)); vpx_memcpy(above_context, initial_above_context_ptr, @@ -525,9 +545,7 @@ static void pick_mb_modes(VP8_COMP *cpi, // Restore the appropriate left context depending on which // row in the SB the MB is situated - vpx_memcpy(&cm->left_context, - &cpi->left_context[i >> 1], - sizeof(ENTROPY_CONTEXT_PLANES)); + xd->left_context = cm->left_context + (i >> 1); // Set up distance of MB to edge of frame in 1/8th pel units xd->mb_to_top_edge = -((mb_row * 16) << 3); @@ -568,9 +586,11 @@ static void pick_mb_modes(VP8_COMP *cpi, // Is segmentation enabled if (xd->segmentation_enabled) { // Code to set segment id in xd->mbmi.segment_id - if (cpi->segmentation_map[map_index] <= 3) + if (xd->update_mb_segmentation_map) mbmi->segment_id = cpi->segmentation_map[map_index]; else + mbmi->segment_id = cm->last_frame_seg_map[map_index]; + if (mbmi->segment_id > 3) mbmi->segment_id = 0; vp8cx_mb_init_quantizer(cpi, x); @@ -583,22 +603,29 @@ static void pick_mb_modes(VP8_COMP *cpi, /* force 4x4 transform for mode selection */ mbmi->txfm_size = TX_4X4; // TODO IS this right?? +#if CONFIG_SUPERBLOCKS + xd->mode_info_context->mbmi.encoded_as_sb = 0; +#endif + cpi->update_context = 0; // TODO Do we need this now?? // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB if (cm->frame_type == KEY_FRAME) { - *totalrate += vp8_rd_pick_intra_mode(cpi, x); - - // Save the coding context - vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context, - sizeof(MODE_INFO)); + int r, d; + vp8_rd_pick_intra_mode(cpi, x, &r, &d); + *totalrate += r; + *totaldist += d; // Dummy encode, do not do the tokenization vp8cx_encode_intra_macro_block(cpi, x, tp, 0); // Note the encoder may have changed the segment_id + + // Save the coding context + vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context, + sizeof(MODE_INFO)); } else { - int seg_id; + int seg_id, r, d; if (xd->segmentation_enabled && cpi->seg0_cnt > 0 && !segfeature_active(xd, 0, SEG_LVL_REF_FRAME) && @@ -612,9 +639,10 @@ static void pick_mb_modes(VP8_COMP *cpi, cpi->seg0_progress = (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols + i) << 16) / cm->MBs; } - *totalrate += vp8cx_pick_mode_inter_macroblock(cpi, x, - recon_yoffset, - recon_uvoffset); + vp8cx_pick_mode_inter_macroblock(cpi, x, recon_yoffset, + recon_uvoffset, &r, &d); + *totalrate += r; + *totaldist += d; // Dummy encode, do not do the tokenization vp8cx_encode_inter_macroblock(cpi, x, tp, @@ -639,11 +667,6 @@ static void pick_mb_modes(VP8_COMP *cpi, } } - // Keep a copy of the updated left context - vpx_memcpy(&cpi->left_context[i >> 1], - &cm->left_context, - sizeof(ENTROPY_CONTEXT_PLANES)); - // Next MB mb_row += dy; mb_col += dx; @@ -664,7 +687,7 @@ static void pick_mb_modes(VP8_COMP *cpi, } /* Restore L & A coding context to those in place on entry */ - vpx_memcpy(cpi->left_context, + vpx_memcpy(cm->left_context, left_context, sizeof(left_context)); vpx_memcpy(initial_above_context_ptr, @@ -672,6 +695,156 @@ static void pick_mb_modes(VP8_COMP *cpi, sizeof(above_context)); } +#if 
CONFIG_SUPERBLOCKS +static void pick_sb_modes (VP8_COMP *cpi, + VP8_COMMON *cm, + int mb_row, + int mb_col, + MACROBLOCK *x, + MACROBLOCKD *xd, + TOKENEXTRA **tp, + int *totalrate, + int *totaldist) +{ + int map_index; + int recon_yoffset, recon_uvoffset; + int ref_fb_idx = cm->lst_fb_idx; + int dst_fb_idx = cm->new_fb_idx; + int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; + int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; + ENTROPY_CONTEXT_PLANES left_context[2]; + ENTROPY_CONTEXT_PLANES above_context[2]; + ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context + + mb_col; + + /* Function should not modify L & A contexts; save and restore on exit */ + vpx_memcpy (left_context, + cm->left_context, + sizeof(left_context)); + vpx_memcpy (above_context, + initial_above_context_ptr, + sizeof(above_context)); + + map_index = (mb_row * cpi->common.mb_cols) + mb_col; + x->mb_activity_ptr = &cpi->mb_activity_map[map_index]; + + /* set above context pointer */ + xd->above_context = cm->above_context + mb_col; + + /* Restore the appropriate left context depending on which + * row in the SB the MB is situated */ + xd->left_context = cm->left_context; + + // Set up distance of MB to edge of frame in 1/8th pel units + xd->mb_to_top_edge = -((mb_row * 16) << 3); + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3; + xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3; + + /* Set up limit values for MV components to prevent them from + * extending beyond the UMV borders assuming 16x16 block size */ + x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND); + x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND); + x->mv_row_max = ((cm->mb_rows - mb_row) * 16 + + (VP8BORDERINPIXELS - 32 - INTERP_EXTEND)); + x->mv_col_max = ((cm->mb_cols - mb_col) * 16 + + (VP8BORDERINPIXELS - 32 - INTERP_EXTEND)); + + xd->up_available = (mb_row != 0); + xd->left_available = (mb_col != 0); + + recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16); + recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8); + + xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset; + xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset; + xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset; +#if 0 // FIXME + /* Copy current MB to a work buffer */ + RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer, + x->src.y_stride, + x->thismb, 16); +#endif + x->rddiv = cpi->RDDIV; + x->rdmult = cpi->RDMULT; + if(cpi->oxcf.tuning == VP8_TUNE_SSIM) + vp8_activity_masking(cpi, x); + /* Is segmentation enabled */ + if (xd->segmentation_enabled) + { + /* Code to set segment id in xd->mbmi.segment_id */ + if (xd->update_mb_segmentation_map) + xd->mode_info_context->mbmi.segment_id = + cpi->segmentation_map[map_index] && + cpi->segmentation_map[map_index + 1] && + cpi->segmentation_map[map_index + cm->mb_cols] && + cpi->segmentation_map[map_index + cm->mb_cols + 1]; + else + xd->mode_info_context->mbmi.segment_id = + cm->last_frame_seg_map[map_index] && + cm->last_frame_seg_map[map_index + 1] && + cm->last_frame_seg_map[map_index + cm->mb_cols] && + cm->last_frame_seg_map[map_index + cm->mb_cols + 1]; + if (xd->mode_info_context->mbmi.segment_id > 3) + xd->mode_info_context->mbmi.segment_id = 0; + + vp8cx_mb_init_quantizer(cpi, x); + } + else + /* Set to Segment 0 by default */ + xd->mode_info_context->mbmi.segment_id = 0; + + x->active_ptr = cpi->active_map + 
map_index; + + cpi->update_context = 0; // TODO Do we need this now?? + + /* Find best coding mode & reconstruct the MB so it is available + * as a predictor for MBs that follow in the SB */ + if (cm->frame_type == KEY_FRAME) + { + vp8_rd_pick_intra_mode_sb(cpi, x, + totalrate, + totaldist); + + /* Save the coding context */ + vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context, + sizeof(MODE_INFO)); + } + else + { + if (xd->segmentation_enabled && cpi->seg0_cnt > 0 && + !segfeature_active( xd, 0, SEG_LVL_REF_FRAME ) && + segfeature_active( xd, 1, SEG_LVL_REF_FRAME ) && + check_segref(xd, 1, INTRA_FRAME) + + check_segref(xd, 1, LAST_FRAME) + + check_segref(xd, 1, GOLDEN_FRAME) + + check_segref(xd, 1, ALTREF_FRAME) == 1) + { + cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt; + } + else + { + cpi->seg0_progress = + (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs; + } + + vp8_rd_pick_inter_mode_sb(cpi, x, + recon_yoffset, + recon_uvoffset, + totalrate, + totaldist); + } + + /* Restore L & A coding context to those in place on entry */ + vpx_memcpy (cm->left_context, + left_context, + sizeof(left_context)); + vpx_memcpy (initial_above_context_ptr, + above_context, + sizeof(above_context)); +} +#endif + static void encode_sb(VP8_COMP *cpi, VP8_COMMON *cm, int mbrow, @@ -679,6 +852,7 @@ static void encode_sb(VP8_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, TOKENEXTRA **tp) { + VP8_COMMON *pc = cm; int i; int map_index; int mb_row, mb_col; @@ -733,22 +907,19 @@ static void encode_sb(VP8_COMP *cpi, // Restore MB state to that when it was picked #if CONFIG_SUPERBLOCKS - if (x->encode_as_sb) + if (xd->mode_info_context->mbmi.encoded_as_sb) { update_state(cpi, x, &x->sb_context[i]); - else + cpi->sb_count++; + } else #endif update_state(cpi, x, &x->mb_context[i]); - // Copy in the appropriate left context - vpx_memcpy(&cm->left_context, - &cpi->left_context[i >> 1], - sizeof(ENTROPY_CONTEXT_PLANES)); - map_index = (mb_row * cpi->common.mb_cols) + mb_col; x->mb_activity_ptr = &cpi->mb_activity_map[map_index]; // reset above block coeffs xd->above_context = cm->above_context + mb_col; + xd->left_context = cm->left_context + (i >> 1); // Set up distance of MB to edge of the frame in 1/8th pel units xd->mb_to_top_edge = -((mb_row * 16) << 3); @@ -756,24 +927,28 @@ static void encode_sb(VP8_COMP *cpi, xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3; xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3; - // Set up limit values for MV components to prevent them from - // extending beyond the UMV borders assuming 16x16 block size - x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND); - x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND); - x->mv_row_max = ((cm->mb_rows - mb_row) * 16 + - (VP8BORDERINPIXELS - 16 - INTERP_EXTEND)); - x->mv_col_max = ((cm->mb_cols - mb_col) * 16 + - (VP8BORDERINPIXELS - 16 - INTERP_EXTEND)); - #if CONFIG_SUPERBLOCKS - // Set up limit values for MV components to prevent them from - // extending beyond the UMV borders assuming 32x32 block size - x->mv_row_min_sb = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND); - x->mv_col_min_sb = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND); - x->mv_row_max_sb = ((cm->mb_rows - mb_row) * 16 + - (VP8BORDERINPIXELS - 32 - INTERP_EXTEND)); - x->mv_col_max_sb = ((cm->mb_cols - mb_col) * 16 + - (VP8BORDERINPIXELS - 32 - INTERP_EXTEND)); + if (xd->mode_info_context->mbmi.encoded_as_sb) { + // Set up limit values for MV components to prevent them 
from + // extending beyond the UMV borders assuming 32x32 block size + x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND); + x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND); + x->mv_row_max = ((cm->mb_rows - mb_row) * 16 + + (VP8BORDERINPIXELS - 32 - INTERP_EXTEND)); + x->mv_col_max = ((cm->mb_cols - mb_col) * 16 + + (VP8BORDERINPIXELS - 32 - INTERP_EXTEND)); + } else { +#endif + // Set up limit values for MV components to prevent them from + // extending beyond the UMV borders assuming 16x16 block size + x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND); + x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND); + x->mv_row_max = ((cm->mb_rows - mb_row) * 16 + + (VP8BORDERINPIXELS - 16 - INTERP_EXTEND)); + x->mv_col_max = ((cm->mb_cols - mb_col) * 16 + + (VP8BORDERINPIXELS - 16 - INTERP_EXTEND)); +#if CONFIG_SUPERBLOCKS + } #endif xd->up_available = (mb_row != 0); @@ -796,24 +971,21 @@ static void encode_sb(VP8_COMP *cpi, // Is segmentation enabled if (xd->segmentation_enabled) { - // Code to set segment id in xd->mbmi.segment_id - if (cpi->segmentation_map[map_index] <= 3) - mbmi->segment_id = cpi->segmentation_map[map_index]; - else - mbmi->segment_id = 0; - vp8cx_mb_init_quantizer(cpi, x); - } else - // Set to Segment 0 by default - mbmi->segment_id = 0; + } x->active_ptr = cpi->active_map + map_index; cpi->update_context = 0; if (cm->frame_type == KEY_FRAME) { - vp8cx_encode_intra_macro_block(cpi, x, tp, 1); - // Note the encoder may have changed the segment_id +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) + vp8cx_encode_intra_super_block(cpi, x, tp, mb_col); + else +#endif + vp8cx_encode_intra_macro_block(cpi, x, tp, 1); + // Note the encoder may have changed the segment_id #ifdef MODE_STATS y_modes[mbmi->mode]++; @@ -822,9 +994,25 @@ static void encode_sb(VP8_COMP *cpi, unsigned char *segment_id; int seg_ref_active; - vp8cx_encode_inter_macroblock(cpi, x, tp, - recon_yoffset, recon_uvoffset, 1); - // Note the encoder may have changed the segment_id + if (xd->mode_info_context->mbmi.ref_frame) { + unsigned char pred_context; + + pred_context = get_pred_context(cm, xd, PRED_COMP); + + if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) + cpi->single_pred_count[pred_context]++; + else + cpi->comp_pred_count[pred_context]++; + } + +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) + vp8cx_encode_inter_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_col, mb_row); + else +#endif + vp8cx_encode_inter_macroblock(cpi, x, tp, + recon_yoffset, recon_uvoffset, 1); + // Note the encoder may have changed the segment_id #ifdef MODE_STATS inter_y_modes[mbmi->mode]++; @@ -864,10 +1052,20 @@ static void encode_sb(VP8_COMP *cpi, // TODO Partitioning is broken! 
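Note on the MV range setup in the hunk above: the superblock and macroblock paths differ only in the bottom/right slack, because the whole predictor block has to stay inside the extended (UMV) border, so a 32x32 superblock needs 32 pixels of room where a 16x16 macroblock needs 16. A minimal standalone sketch of that calculation follows; the helper name is made up for illustration, and the VP8BORDERINPIXELS / INTERP_EXTEND values are assumed rather than taken from this patch.

#include <stdio.h>

#define BORDER        32  /* stands in for VP8BORDERINPIXELS (assumed value) */
#define INTERP_EXTEND  4  /* assumed interpolation filter extension */

/* Illustrative helper, not part of the patch: compute full-pel MV limits for
 * a block of size bsize x bsize located at macroblock (mb_row, mb_col). */
static void mv_limits(int mb_row, int mb_col, int mb_rows, int mb_cols,
                      int bsize, int *row_min, int *row_max,
                      int *col_min, int *col_max) {
  /* top/left limits do not depend on the block size ... */
  *row_min = -((mb_row * 16) + BORDER - INTERP_EXTEND);
  *col_min = -((mb_col * 16) + BORDER - INTERP_EXTEND);
  /* ... but the bottom/right limits must leave room for the whole block,
   * hence the "- 32" used for superblocks vs "- 16" for macroblocks. */
  *row_max = (mb_rows - mb_row) * 16 + (BORDER - bsize - INTERP_EXTEND);
  *col_max = (mb_cols - mb_col) * 16 + (BORDER - bsize - INTERP_EXTEND);
}

int main(void) {
  int rmin, rmax, cmin, cmax;
  mv_limits(0, 0, 30, 40, 32, &rmin, &rmax, &cmin, &cmax);  /* SB case */
  printf("SB at (0,0): rows [%d, %d], cols [%d, %d]\n", rmin, rmax, cmin, cmax);
  mv_limits(0, 0, 30, 40, 16, &rmin, &rmax, &cmin, &cmax);  /* MB case */
  printf("MB at (0,0): rows [%d, %d], cols [%d, %d]\n", rmin, rmax, cmin, cmax);
  return 0;
}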
cpi->tplist[mb_row].stop = *tp; - // Copy back updated left context - vpx_memcpy(&cpi->left_context[i >> 1], - &cm->left_context, - sizeof(ENTROPY_CONTEXT_PLANES)); +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + x->src.y_buffer += 32; + x->src.u_buffer += 16; + x->src.v_buffer += 16; + + x->gf_active_ptr += 2; + x->partition_info += 2; + xd->mode_info_context += 2; + xd->prev_mode_info_context += 2; + + break; + } +#endif // Next MB mb_row += dy; @@ -911,14 +1109,13 @@ void encode_sb_row(VP8_COMP *cpi, int mb_cols = cm->mb_cols; // Initialize the left context for the new SB row - vpx_memset(cpi->left_context, 0, sizeof(cpi->left_context)); - vpx_memset(&cm->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memset(cm->left_context, 0, sizeof(cm->left_context)); // Code each SB in the row for (mb_col = 0; mb_col < mb_cols; mb_col += 2) { - int mb_rate = 0; + int mb_rate = 0, mb_dist = 0; #if CONFIG_SUPERBLOCKS - int sb_rate = INT_MAX; + int sb_rate = INT_MAX, sb_dist; #endif #if CONFIG_DEBUG @@ -930,8 +1127,14 @@ void encode_sb_row(VP8_COMP *cpi, unsigned char *vb = x->src.v_buffer; #endif +#if CONFIG_SUPERBLOCKS // Pick modes assuming the SB is coded as 4 independent MBs - pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate); + xd->mode_info_context->mbmi.encoded_as_sb = 0; +#endif + pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist); +#if CONFIG_SUPERBLOCKS + mb_rate += vp8_cost_bit(cm->sb_coded, 0); +#endif x->src.y_buffer -= 32; x->src.u_buffer -= 16; @@ -952,21 +1155,40 @@ void encode_sb_row(VP8_COMP *cpi, #endif #if CONFIG_SUPERBLOCKS - // Pick a mode assuming that it applies all 4 of the MBs in the SB - pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, &sb_rate); + if (!((( mb_cols & 1) && mb_col == mb_cols - 1) || + ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) { + /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */ + xd->mode_info_context->mbmi.encoded_as_sb = 1; + pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist); + sb_rate += vp8_cost_bit(cm->sb_coded, 1); + } - // Decide whether to encode as a SB or 4xMBs - if (sb_rate < mb_rate) { - x->encode_as_sb = 1; + /* Decide whether to encode as a SB or 4xMBs */ + if (sb_rate < INT_MAX && + RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) < + RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) { + xd->mode_info_context->mbmi.encoded_as_sb = 1; + xd->mode_info_context[1].mbmi.encoded_as_sb = 1; + xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1; + xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1; *totalrate += sb_rate; } else #endif { - x->encode_as_sb = 0; +#if CONFIG_SUPERBLOCKS + xd->mode_info_context->mbmi.encoded_as_sb = 0; + if (cm->mb_cols - 1 > mb_col) + xd->mode_info_context[1].mbmi.encoded_as_sb = 0; + if (cm->mb_rows - 1 > mb_row) { + xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0; + if (cm->mb_cols - 1 > mb_col) + xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0; + } +#endif *totalrate += mb_rate; } - // Encode SB using best computed mode(s) + /* Encode SB using best computed mode(s) */ encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp); #if CONFIG_DEBUG @@ -1038,8 +1260,6 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) { xd->mode_info_context->mbmi.mode = DC_PRED; xd->mode_info_context->mbmi.uv_mode = DC_PRED; - xd->left_context = &cm->left_context; - vp8_zero(cpi->count_mb_ref_frame_usage) vp8_zero(cpi->bmode_count) vp8_zero(cpi->ymode_count) @@ 
-1049,6 +1269,10 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) { vp8_zero(cpi->mbsplit_count) vp8_zero(cpi->common.fc.mv_ref_ct) vp8_zero(cpi->common.fc.mv_ref_ct_a) +#if CONFIG_SUPERBLOCKS + vp8_zero(cpi->sb_ymode_count) + cpi->sb_count = 0; +#endif // vp8_zero(cpi->uv_mode_count) x->mvc = cm->fc.mvc; @@ -1380,7 +1604,12 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x) { } #endif - ++cpi->ymode_count[m]; +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + ++cpi->sb_ymode_count[m]; + } else +#endif + ++cpi->ymode_count[m]; if (m != I8X8_PRED) ++cpi->y_uv_mode_count[m][uvm]; else { @@ -1418,6 +1647,160 @@ static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x) { #endif } +#if CONFIG_SUPERBLOCKS +static void update_sb_skip_coeff_state(VP8_COMP *cpi, + MACROBLOCK *x, + ENTROPY_CONTEXT_PLANES ta[4], + ENTROPY_CONTEXT_PLANES tl[4], + TOKENEXTRA *t[4], + TOKENEXTRA **tp, + int skip[4]) +{ + TOKENEXTRA tokens[4][16 * 24]; + int n_tokens[4], n; + + // if there were no skips, we don't need to do anything + if (!skip[0] && !skip[1] && !skip[2] && !skip[3]) + return; + + // if we don't do coeff skipping for this frame, we don't + // need to do anything here + if (!cpi->common.mb_no_coeff_skip) + return; + + // if all 4 MBs skipped coeff coding, nothing to be done + if (skip[0] && skip[1] && skip[2] && skip[3]) + return; + + // so the situation now is that we want to skip coeffs + // for some MBs, but not all, and we didn't code EOB + // coefficients for them. However, the skip flag for this + // SB will be 0 overall, so we need to insert EOBs in the + // middle of the token tree. Do so here. + n_tokens[0] = t[1] - t[0]; + n_tokens[1] = t[2] - t[1]; + n_tokens[2] = t[3] - t[2]; + n_tokens[3] = *tp - t[3]; + if (n_tokens[0]) + memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0])); + if (n_tokens[1]) + memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0])); + if (n_tokens[2]) + memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0])); + if (n_tokens[3]) + memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0])); + + // reset pointer, stuff EOBs where necessary + *tp = t[0]; + for (n = 0; n < 4; n++) { + TOKENEXTRA *tbak = *tp; + if (skip[n]) { + x->e_mbd.above_context = &ta[n]; + x->e_mbd.left_context = &tl[n]; + vp8_stuff_mb_8x8(cpi, &x->e_mbd, tp, 0); + } else { + if (n_tokens[n]) { + memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]); + } + (*tp) += n_tokens[n]; + } + } +} + +void vp8cx_encode_intra_super_block(VP8_COMP *cpi, + MACROBLOCK *x, + TOKENEXTRA **t, + int mb_col) { + const int output_enabled = 1; + int n; + MACROBLOCKD *xd = &x->e_mbd; + VP8_COMMON *cm = &cpi->common; + const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer; + const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer; + const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer; + int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride; + int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride; + const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd); + TOKENEXTRA *tp[4]; + int skip[4]; + MODE_INFO *mi = x->e_mbd.mode_info_context; + ENTROPY_CONTEXT_PLANES ta[4], tl[4]; + + if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) { + adjust_act_zbin(cpi, x); + vp8_update_zbin_extra(cpi, x); + } + + /* test code: set transform size based on mode selection */ + if (cpi->common.txfm_mode == ALLOW_8X8) { + x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8; + x->e_mbd.mode_info_context[1].mbmi.txfm_size = TX_8X8; + 
x->e_mbd.mode_info_context[cm->mode_info_stride].mbmi.txfm_size = TX_8X8; + x->e_mbd.mode_info_context[cm->mode_info_stride+1].mbmi.txfm_size = TX_8X8; + cpi->t8x8_count++; + } else { + x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4; + cpi->t4x4_count++; + } + + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sby_s)(&x->e_mbd); + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sbuv_s)(&x->e_mbd); + + assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8); + for (n = 0; n < 4; n++) + { + int x_idx = n & 1, y_idx = n >> 1; + + xd->above_context = cm->above_context + mb_col + (n & 1); + xd->left_context = cm->left_context + (n >> 1); + + vp8_subtract_mby_s_c(x->src_diff, + src + x_idx * 16 + y_idx * 16 * src_y_stride, + src_y_stride, + dst + x_idx * 16 + y_idx * 16 * dst_y_stride, + dst_y_stride); + vp8_subtract_mbuv_s_c(x->src_diff, + usrc + x_idx * 8 + y_idx * 8 * src_uv_stride, + vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride, + src_uv_stride, + udst + x_idx * 8 + y_idx * 8 * dst_uv_stride, + vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride, + dst_uv_stride); + vp8_transform_intra_mby_8x8(x); + vp8_transform_mbuv_8x8(x); + vp8_quantize_mby_8x8(x); + vp8_quantize_mbuv_8x8(x); + if (x->optimize) { + vp8_optimize_mby_8x8(x, rtcd); + vp8_optimize_mbuv_8x8(x, rtcd); + } + vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + vp8_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + vp8_recon_mby_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd, + dst + x_idx * 16 + y_idx * 16 * dst_y_stride); + vp8_recon_mbuv_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd, + udst + x_idx * 8 + y_idx * 8 * dst_uv_stride, + vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride); + + if (output_enabled) { + memcpy(&ta[n], xd->above_context, sizeof(ta[n])); + memcpy(&tl[n], xd->left_context, sizeof(tl[n])); + tp[n] = *t; + xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride; + vp8_tokenize_mb(cpi, &x->e_mbd, t, 0); + skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff; + } + } + + if (output_enabled) { + // Tokenize + xd->mode_info_context = mi; + sum_intra_stats(cpi, x); + update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip); + } +} +#endif + void vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, @@ -1484,6 +1867,9 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x, unsigned char ref_pred_flag; x->skip = 0; +#if CONFIG_SUPERBLOCKS + assert(!xd->mode_info_context->mbmi.encoded_as_sb); +#endif #if CONFIG_SWITCHABLE_INTERP vp8_setup_interp_filters(xd, mbmi->interp_filter, cm); @@ -1648,3 +2034,190 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x, } } } + +#if CONFIG_SUPERBLOCKS +void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, + int recon_yoffset, int recon_uvoffset, int mb_col, int mb_row) { + const int output_enabled = 1; + VP8_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer; + const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer; + const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer; + int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride; + int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride; + const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd); + int mis = xd->mode_info_stride; + unsigned int segment_id = xd->mode_info_context->mbmi.segment_id; + int seg_ref_active; + unsigned char ref_pred_flag; + int n; + TOKENEXTRA *tp[4]; + 
int skip[4]; + MODE_INFO *mi = x->e_mbd.mode_info_context; + ENTROPY_CONTEXT_PLANES ta[4], tl[4]; + + x->skip = 0; + + if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { + // Adjust the zbin based on this MB rate. + adjust_act_zbin(cpi, x); + } + + { + // Experimental code. Special case for gf and arf zeromv modes. + // Increase zbin size to suppress noise + cpi->zbin_mode_boost = 0; + if (cpi->zbin_mode_boost_enabled) { + if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) { + if (xd->mode_info_context->mbmi.mode == ZEROMV) { + if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME) + cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; + else + cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; + } else if (xd->mode_info_context->mbmi.mode == SPLITMV) + cpi->zbin_mode_boost = 0; + else + cpi->zbin_mode_boost = MV_ZBIN_BOOST; + } + } + + vp8_update_zbin_extra(cpi, x); + } + + seg_ref_active = segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME); + + // SET VARIOUS PREDICTION FLAGS + + // Did the chosen reference frame match its predicted value. + ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame == + get_pred_ref(cm, xd))); + set_pred_flag(xd, PRED_REF, ref_pred_flag); + + /* test code: set transform size based on mode selection */ + if (cpi->common.txfm_mode == ALLOW_8X8 + && x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED + && x->e_mbd.mode_info_context->mbmi.mode != B_PRED + && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) { + x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8; + cpi->t8x8_count++; + } else { + x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4; + cpi->t4x4_count++; + } + + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sby_s)(&x->e_mbd); + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sbuv_s)(&x->e_mbd); + } else { + int ref_fb_idx; + + if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) + ref_fb_idx = cpi->common.lst_fb_idx; + else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME) + ref_fb_idx = cpi->common.gld_fb_idx; + else + ref_fb_idx = cpi->common.alt_fb_idx; + + xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset; + xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; + xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; + + if (xd->mode_info_context->mbmi.second_ref_frame) { + int second_ref_fb_idx; + + if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME) + second_ref_fb_idx = cpi->common.lst_fb_idx; + else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME) + second_ref_fb_idx = cpi->common.gld_fb_idx; + else + second_ref_fb_idx = cpi->common.alt_fb_idx; + + xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + + recon_yoffset; + xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + + recon_uvoffset; + xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + + recon_uvoffset; + } + + vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); + } + + assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8); + for (n = 0; n < 4; n++) + { + int x_idx = n & 1, y_idx = n >> 1; + + vp8_subtract_mby_s_c(x->src_diff, + src + x_idx * 16 + y_idx * 16 * src_y_stride, + src_y_stride, + dst + x_idx * 16 + y_idx * 16 * dst_y_stride, + dst_y_stride); + vp8_subtract_mbuv_s_c(x->src_diff, + usrc + x_idx * 8 + y_idx * 8 * src_uv_stride, + vsrc 
+ x_idx * 8 + y_idx * 8 * src_uv_stride, + src_uv_stride, + udst + x_idx * 8 + y_idx * 8 * dst_uv_stride, + vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride, + dst_uv_stride); + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { + vp8_transform_intra_mby_8x8(x); + } else { + vp8_transform_mby_8x8(x); + } + vp8_transform_mbuv_8x8(x); + vp8_quantize_mby_8x8(x); + vp8_quantize_mbuv_8x8(x); + if (x->optimize) { + vp8_optimize_mby_8x8(x, rtcd); + vp8_optimize_mbuv_8x8(x, rtcd); + } + vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + vp8_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + vp8_recon_mby_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd, + dst + x_idx * 16 + y_idx * 16 * dst_y_stride); + vp8_recon_mbuv_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd, + udst + x_idx * 8 + y_idx * 8 * dst_uv_stride, + vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride); + + if (!x->skip) { + if (output_enabled) { + xd->left_context = cm->left_context + (n >> 1); + xd->above_context = cm->above_context + mb_col + (n >> 1); + memcpy(&ta[n], xd->above_context, sizeof(ta[n])); + memcpy(&tl[n], xd->left_context, sizeof(tl[n])); + tp[n] = *t; + xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride; + vp8_tokenize_mb(cpi, &x->e_mbd, t, 0); + skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff; + } + } else { + int mb_skip_context = + cpi->common.mb_no_coeff_skip ? + (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff + + (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff : + 0; + if (cpi->common.mb_no_coeff_skip) { + skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff = 1; + xd->left_context = cm->left_context + (n >> 1); + xd->above_context = cm->above_context + mb_col + (n >> 1); + memcpy(&ta[n], xd->above_context, sizeof(ta[n])); + memcpy(&tl[n], xd->left_context, sizeof(tl[n])); + tp[n] = *t; + cpi->skip_true_count[mb_skip_context]++; + vp8_fix_contexts(xd); + } else { + vp8_stuff_mb(cpi, xd, t, 0); + xd->mode_info_context->mbmi.mb_skip_coeff = 0; + cpi->skip_false_count[mb_skip_context]++; + } + } + } + + xd->mode_info_context = mi; + update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip); +} +#endif diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index e03b47e2c..473f8ba3d 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -67,11 +67,10 @@ void vp8_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) { } } -void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) { +void vp8_subtract_mbuv_s_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, + unsigned char *upred, unsigned char *vpred, int dst_stride) { short *udiff = diff + 256; short *vdiff = diff + 320; - unsigned char *upred = pred + 256; - unsigned char *vpred = pred + 320; int r, c; @@ -81,8 +80,8 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, } udiff += 8; - upred += 8; - usrc += stride; + upred += dst_stride; + usrc += src_stride; } for (r = 0; r < 8; r++) { @@ -91,12 +90,19 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, } vdiff += 8; - vpred += 8; - vsrc += stride; + vpred += dst_stride; + vsrc += src_stride; } } -void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) { +void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) { + unsigned char *upred = pred + 256; + unsigned char *vpred = pred + 320; + + 
vp8_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8); +} + +void vp8_subtract_mby_s_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int dst_stride) { int r, c; for (r = 0; r < 16; r++) { @@ -105,11 +111,16 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, in } diff += 16; - pred += 16; - src += stride; + pred += dst_stride; + src += src_stride; } } +void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) +{ + vp8_subtract_mby_s_c(diff, src, stride, pred, 16); +} + static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { BLOCK *b = &x->block[0]; diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index 6390f3fe4..6a5bf59d5 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -23,24 +23,36 @@ extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER void vp8_cmachine_specific_config(VP8_COMP *cpi) { #if CONFIG_RUNTIME_CPU_DETECT cpi->rtcd.common = &cpi->common.rtcd; +#if CONFIG_SUPERBLOCKS + cpi->rtcd.variance.sad32x32 = vp8_sad32x32_c; +#endif cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c; cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c; cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c; cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c; cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c; +#if CONFIG_SUPERBLOCKS + cpi->rtcd.variance.sad32x32x3 = vp8_sad32x32x3_c; +#endif cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_c; cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_c; cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_c; cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c; cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c; +#if CONFIG_SUPERBLOCKS + cpi->rtcd.variance.sad32x32x8 = vp8_sad32x32x8_c; +#endif cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_c; cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_c; cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_c; cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_c; cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_c; +#if CONFIG_SUPERBLOCKS + cpi->rtcd.variance.sad32x32x4d = vp8_sad32x32x4d_c; +#endif cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c; cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c; cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c; @@ -54,16 +66,34 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) { cpi->rtcd.variance.var8x16 = vp8_variance8x16_c; cpi->rtcd.variance.var16x8 = vp8_variance16x8_c; cpi->rtcd.variance.var16x16 = vp8_variance16x16_c; +#if CONFIG_SUPERBLOCKS + cpi->rtcd.variance.var32x32 = vp8_variance32x32_c; +#endif cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c; cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c; cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c; cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c; cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c; +#if CONFIG_SUPERBLOCKS + cpi->rtcd.variance.subpixvar32x32 = vp8_sub_pixel_variance32x32_c; +#endif cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c; +#if CONFIG_SUPERBLOCKS + cpi->rtcd.variance.halfpixvar32x32_h = vp8_variance_halfpixvar32x32_h_c; +#endif cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c; +#if CONFIG_SUPERBLOCKS + cpi->rtcd.variance.halfpixvar32x32_v = vp8_variance_halfpixvar32x32_v_c; +#endif cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c; +#if CONFIG_SUPERBLOCKS + cpi->rtcd.variance.halfpixvar32x32_hv = vp8_variance_halfpixvar32x32_hv_c; 
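The 32x32 distortion hooks registered above mirror the existing 16x16 entries, just over a larger block. As a rough illustration of what the plain-C SAD kernel computes (the real vp8_sad32x32_c is defined elsewhere in the encoder and is not shown in this patch, so details may differ), a self-contained sketch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Sum of absolute differences over a 32x32 block; strides may exceed 32
 * when the buffers are rows of a padded frame. */
static unsigned int sad32x32(const unsigned char *src, int src_stride,
                             const unsigned char *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 32; r++) {
    for (c = 0; c < 32; c++)
      sad += abs(src[c] - ref[c]);  /* per-pixel absolute difference */
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

int main(void) {
  unsigned char src[32 * 32], ref[32 * 32];
  memset(src, 100, sizeof(src));
  memset(ref, 90, sizeof(ref));
  /* every pixel differs by 10, so the SAD is 32 * 32 * 10 = 10240 */
  printf("sad = %u\n", sad32x32(src, 32, ref, 32));
  return 0;
}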
+#endif cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_c; +#if CONFIG_SUPERBLOCKS + cpi->rtcd.variance.subpixmse32x32 = vp8_sub_pixel_mse32x32_c; +#endif cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c; cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c; diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index ba4cd897d..a0621b649 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -243,7 +243,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int y_stride; int offset; -#if ARCH_X86 || ARCH_X86_64 +#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64) unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col; unsigned char *y; int buf_r1, buf_r2, buf_c1, buf_c2; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index bcbc85766..deff0db08 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -620,6 +620,42 @@ static void print_seg_map(VP8_COMP *cpi) { fclose(statsfile); } +static void update_reference_segmentation_map(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + int row, col, sb_rows = (cm->mb_rows + 1) >> 1, sb_cols = (cm->mb_cols + 1) >> 1; + MODE_INFO *mi = cm->mi; + uint8_t *segmap = cpi->segmentation_map; + uint8_t *segcache = cm->last_frame_seg_map; + + for (row = 0; row < sb_rows; row++) { + for (col = 0; col < sb_cols; col++) { + MODE_INFO *miptr = mi + col * 2; + uint8_t *seg = segmap + col * 2; + uint8_t *cache = segcache + col * 2; +#if CONFIG_SUPERBLOCKS + if (miptr->mbmi.encoded_as_sb) { + cache[0] = cache[1] = cache[cm->mb_cols] = cache[cm->mb_cols + 1] = + miptr->mbmi.segment_id; + } else +#endif + { + cache[0] = miptr[0].mbmi.segment_id; + if (!(cm->mb_cols & 1) || col < sb_cols - 1) + cache[1] = miptr[1].mbmi.segment_id; + if (!(cm->mb_rows & 1) || row < sb_rows - 1) { + cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id; + if (!(cm->mb_cols & 1) || col < sb_cols - 1) + cache[1] = miptr[1].mbmi.segment_id; + cache[cm->mb_cols + 1] = miptr[cm->mode_info_stride + 1].mbmi.segment_id; + } + } + } + segmap += 2 * cm->mb_cols; + segcache += 2 * cm->mb_cols; + mi += 2 * cm->mode_info_stride; + } +} + static void set_default_lf_deltas(VP8_COMP *cpi) { cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1; cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; @@ -1736,6 +1772,9 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) { cm->prob_last_coded = 128; cm->prob_gf_coded = 128; cm->prob_intra_coded = 63; +#if CONFIG_SUPERBLOCKS + cm->sb_coded = 200; +#endif for (i = 0; i < COMP_PRED_CONTEXTS; i++) cm->prob_comppred[i] = 128; @@ -1919,6 +1958,18 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) { init_mv_ref_counts(); #endif +#if CONFIG_SUPERBLOCKS + cpi->fn_ptr[BLOCK_32X32].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32); + cpi->fn_ptr[BLOCK_32X32].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32); + cpi->fn_ptr[BLOCK_32X32].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar32x32); + cpi->fn_ptr[BLOCK_32X32].svf_halfpix_h = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_h); + cpi->fn_ptr[BLOCK_32X32].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_v); + cpi->fn_ptr[BLOCK_32X32].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_hv); + cpi->fn_ptr[BLOCK_32X32].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x3); + cpi->fn_ptr[BLOCK_32X32].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x8); + cpi->fn_ptr[BLOCK_32X32].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x4d); +#endif + 
cpi->fn_ptr[BLOCK_16X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16); cpi->fn_ptr[BLOCK_16X16].vf = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16); cpi->fn_ptr[BLOCK_16X16].svf = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16); @@ -3618,6 +3669,10 @@ static void encode_frame_to_data_rate cpi->dummy_packing = 0; vp8_pack_bitstream(cpi, dest, size); + if (cpi->mb.e_mbd.update_mb_segmentation_map) { + update_reference_segmentation_map(cpi); + } + #if CONFIG_PRED_FILTER // Select the prediction filtering mode to use for the // next frame based on the current frame selections diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index ff3a21107..7fb7dd2ff 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -359,7 +359,9 @@ enum { BLOCK_8X8, BLOCK_4X4, BLOCK_16X16, - BLOCK_MAX_SEGMENTS + BLOCK_MAX_SEGMENTS, + BLOCK_32X32 = BLOCK_MAX_SEGMENTS, + BLOCK_MAX_SB_SEGMENTS, }; typedef struct VP8_COMP { @@ -528,6 +530,10 @@ typedef struct VP8_COMP { int cq_target_quality; +#if CONFIG_SUPERBLOCKS + int sb_count; + int sb_ymode_count [VP8_I32X32_MODES]; +#endif int ymode_count [VP8_YMODES]; /* intra MB type cts this frame */ int bmode_count [VP8_BINTRAMODES]; int i8x8_mode_count [VP8_I8X8_MODES]; @@ -628,7 +634,7 @@ typedef struct VP8_COMP { vp8_full_search_fn_t full_search_sad; vp8_refining_search_fn_t refining_search_sad; vp8_diamond_search_fn_t diamond_search_sad; - vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS]; + vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS]; uint64_t time_receive_data; uint64_t time_compress_data; uint64_t time_pick_lpf; @@ -732,9 +738,6 @@ typedef struct VP8_COMP { int droppable; - // Global store for SB left contexts, one for each MB row in the SB - ENTROPY_CONTEXT_PLANES left_context[2]; - // TODO Do we still need this?? 
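On the onyx_int.h change shown above: BLOCK_32X32 is aliased onto the slot right after the legacy per-segment block sizes, so arrays sized with BLOCK_MAX_SEGMENTS keep their old size and only fn_ptr[] grows to BLOCK_MAX_SB_SEGMENTS. A small illustrative sketch of that aliasing (enumerators before BLOCK_8X8 are elided here, and main() exists only for demonstration):

#include <stdio.h>

enum {
  BLOCK_8X8,
  BLOCK_4X4,
  BLOCK_16X16,
  BLOCK_MAX_SEGMENTS,
  BLOCK_32X32 = BLOCK_MAX_SEGMENTS,  /* reuses the next free slot */
  BLOCK_MAX_SB_SEGMENTS,             /* one larger than the legacy count */
};

int main(void) {
  printf("legacy table size: %d entries\n", BLOCK_MAX_SEGMENTS);
  printf("fn_ptr table size: %d entries (BLOCK_32X32 = index %d)\n",
         BLOCK_MAX_SB_SEGMENTS, BLOCK_32X32);
  return 0;
}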
int update_context; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 720736f33..92a80ecba 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -718,7 +718,7 @@ static void macro_block_yrd(MACROBLOCK *mb, *Rate = vp8_rdcost_mby(mb); } -static int vp8_rdcost_mby_8x8(MACROBLOCK *mb) { +static int vp8_rdcost_mby_8x8(MACROBLOCK *mb, int backup) { int cost = 0; int b; MACROBLOCKD *xd = &mb->e_mbd; @@ -726,11 +726,16 @@ static int vp8_rdcost_mby_8x8(MACROBLOCK *mb) { ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; - vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + if (backup) { + vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES)); - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; + ta = (ENTROPY_CONTEXT *)&t_above; + tl = (ENTROPY_CONTEXT *)&t_left; + } else { + ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context; + tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context; + } for (b = 0; b < 16; b += 4) cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC, @@ -775,7 +780,7 @@ static void macro_block_yrd_8x8(MACROBLOCK *mb, *Distortion = (d >> 2); // rate - *Rate = vp8_rdcost_mby_8x8(mb); + *Rate = vp8_rdcost_mby_8x8(mb, 1); } #if CONFIG_TX16X16 @@ -823,6 +828,66 @@ static void copy_predictor(unsigned char *dst, const unsigned char *predictor) { d[12] = p[12]; } +#if CONFIG_SUPERBLOCKS +static void super_block_yrd_8x8(MACROBLOCK *x, + int *rate, + int *distortion, + const VP8_ENCODER_RTCD *rtcd, int *skip) +{ + MACROBLOCKD *const xd = &x->e_mbd; + BLOCK *const by2 = x->block + 24; + BLOCKD *const bdy2 = xd->block + 24; + int d = 0, r = 0, n; + const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer; + int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride; + ENTROPY_CONTEXT_PLANES *ta = xd->above_context; + ENTROPY_CONTEXT_PLANES *tl = xd->left_context; + ENTROPY_CONTEXT_PLANES t_above[2]; + ENTROPY_CONTEXT_PLANES t_left[2]; + int skippable = 1; + + vpx_memcpy(t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(t_left, xd->left_context, sizeof(t_left)); + + for (n = 0; n < 4; n++) { + int x_idx = n & 1, y_idx = n >> 1; + + vp8_subtract_mby_s_c(x->src_diff, + src + x_idx * 16 + y_idx * 16 * src_y_stride, + src_y_stride, + dst + x_idx * 16 + y_idx * 16 * dst_y_stride, + dst_y_stride); + vp8_transform_mby_8x8(x); + vp8_quantize_mby_8x8(x); + + /* remove 1st order dc to properly combine 1st/2nd order distortion */ + x->coeff[ 0] = 0; + x->coeff[ 64] = 0; + x->coeff[128] = 0; + x->coeff[192] = 0; + xd->dqcoeff[ 0] = 0; + xd->dqcoeff[ 64] = 0; + xd->dqcoeff[128] = 0; + xd->dqcoeff[192] = 0; + + d += ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(x, 0); + d += ENCODEMB_INVOKE(&rtcd->encodemb, berr)(by2->coeff, bdy2->dqcoeff, 16); + xd->above_context = ta + x_idx; + xd->left_context = tl + y_idx; + r += vp8_rdcost_mby_8x8(x, 0); + skippable = skippable && mby_is_skippable_8x8(xd); + } + + *distortion = (d >> 2); + *rate = r; + if (skip) *skip = skippable; + xd->above_context = ta; + xd->left_context = tl; + vpx_memcpy(xd->above_context, &t_above, sizeof(t_above)); + vpx_memcpy(xd->left_context, &t_left, sizeof(t_left)); +} +#endif + static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) { const unsigned int *p = (const unsigned int *)predictor; unsigned int *d = (unsigned int *)dst; @@ -1062,6 +1127,45 @@ static int64_t 
rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rat return RDCOST(mb->rdmult, mb->rddiv, cost, distortion); } +#if CONFIG_SUPERBLOCKS +static int64_t rd_pick_intra_sby_mode(VP8_COMP *cpi, + MACROBLOCK *x, + int *rate, + int *rate_tokenonly, + int *distortion) { + MB_PREDICTION_MODE mode; + MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + int this_rate, this_rate_tokenonly; + int this_distortion; + int64_t best_rd = INT64_MAX, this_rd; + + /* Y Search for 32x32 intra prediction mode */ + for (mode = DC_PRED; mode <= TM_PRED; mode++) { + x->e_mbd.mode_info_context->mbmi.mode = mode; + RECON_INVOKE(&cpi->common.rtcd.recon, + build_intra_predictors_sby_s)(&x->e_mbd); + + super_block_yrd_8x8(x, &this_rate_tokenonly, + &this_distortion, IF_RTCD(&cpi->rtcd), NULL); + this_rate = this_rate_tokenonly + + x->mbmode_cost[x->e_mbd.frame_type] + [x->e_mbd.mode_info_context->mbmi.mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); + + if (this_rd < best_rd) { + mode_selected = mode; + best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = this_distortion; + } + } + + x->e_mbd.mode_info_context->mbmi.mode = mode_selected; + + return best_rd; +} +#endif static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, @@ -1372,18 +1476,23 @@ static int64_t rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); } -static int rd_cost_mbuv_8x8(MACROBLOCK *mb) { +static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) { int b; int cost = 0; MACROBLOCKD *xd = &mb->e_mbd; ENTROPY_CONTEXT_PLANES t_above, t_left; ENTROPY_CONTEXT *ta, *tl; - vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + if (backup) { + vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES)); - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; + ta = (ENTROPY_CONTEXT *)&t_above; + tl = (ENTROPY_CONTEXT *)&t_left; + } else { + ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context; + tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context; + } for (b = 16; b < 24; b += 4) cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV, @@ -1393,6 +1502,54 @@ static int rd_cost_mbuv_8x8(MACROBLOCK *mb) { return cost; } +#if CONFIG_SUPERBLOCKS +static int64_t rd_inter32x32_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate, + int *distortion, int fullpixel, int *skip) { + MACROBLOCKD *xd = &x->e_mbd; + int n, r = 0, d = 0; + const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer; + const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer; + int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride; + int skippable = 1; + ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; + ENTROPY_CONTEXT_PLANES *ta = xd->above_context; + ENTROPY_CONTEXT_PLANES *tl = xd->left_context; + + memcpy(t_above, xd->above_context, sizeof(t_above)); + memcpy(t_left, xd->left_context, sizeof(t_left)); + + for (n = 0; n < 4; n++) { + int x_idx = n & 1, y_idx = n >> 1; + + vp8_subtract_mbuv_s_c(x->src_diff, + usrc + x_idx * 8 + y_idx * 8 * src_uv_stride, + vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride, + src_uv_stride, + udst + x_idx * 8 + y_idx * 8 * dst_uv_stride, + vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride, + dst_uv_stride); + + vp8_transform_mbuv_8x8(x); + vp8_quantize_mbuv_8x8(x); + + xd->above_context = ta + x_idx; + 
xd->left_context = tl + y_idx; + r += rd_cost_mbuv_8x8(x, 0); + d += ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4; + skippable = skippable && mbuv_is_skippable_8x8(xd); + } + + *rate = r; + *distortion = d; + if (skip) *skip = skippable; + xd->left_context = tl; + xd->above_context = ta; + memcpy(xd->above_context, t_above, sizeof(t_above)); + memcpy(xd->left_context, t_left, sizeof(t_left)); + + return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); +} +#endif static int64_t rd_inter16x16_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int fullpixel) { @@ -1403,7 +1560,7 @@ static int64_t rd_inter16x16_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate, vp8_quantize_mbuv_8x8(x); - *rate = rd_cost_mbuv_8x8(x); + *rate = rd_cost_mbuv_8x8(x, 1); *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4; return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); @@ -1527,7 +1684,7 @@ static void rd_pick_intra_mbuv_mode_8x8(VP8_COMP *cpi, vp8_quantize_mbuv_8x8(x); - rate_to = rd_cost_mbuv_8x8(x); + rate_to = rd_cost_mbuv_8x8(x, 1); rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode]; distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4; @@ -1546,6 +1703,91 @@ static void rd_pick_intra_mbuv_mode_8x8(VP8_COMP *cpi, mbmi->uv_mode = mode_selected; } +#if CONFIG_SUPERBLOCKS +static void super_block_uvrd_8x8(MACROBLOCK *x, + int *rate, + int *distortion, + const VP8_ENCODER_RTCD *rtcd) { + MACROBLOCKD *const xd = &x->e_mbd; + int d = 0, r = 0, n; + const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer; + const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer; + int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride; + ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; + ENTROPY_CONTEXT_PLANES *ta = xd->above_context; + ENTROPY_CONTEXT_PLANES *tl = xd->left_context; + + memcpy(t_above, xd->above_context, sizeof(t_above)); + memcpy(t_left, xd->left_context, sizeof(t_left)); + + for (n = 0; n < 4; n++) { + int x_idx = n & 1, y_idx = n >> 1; + + vp8_subtract_mbuv_s_c(x->src_diff, + usrc + x_idx * 8 + y_idx * 8 * src_uv_stride, + vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride, + src_uv_stride, + udst + x_idx * 8 + y_idx * 8 * dst_uv_stride, + vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride, + dst_uv_stride); + vp8_transform_mbuv_8x8(x); + vp8_quantize_mbuv_8x8(x); + + d += ENCODEMB_INVOKE(&rtcd->encodemb, mbuverr)(x) >> 2; + xd->above_context = ta + x_idx; + xd->left_context = tl + y_idx; + r += rd_cost_mbuv_8x8(x, 0); + } + + xd->above_context = ta; + xd->left_context = tl; + *distortion = (d >> 2); + *rate = r; + + xd->left_context = tl; + xd->above_context = ta; + memcpy(xd->above_context, t_above, sizeof(t_above)); + memcpy(xd->left_context, t_left, sizeof(t_left)); +} + +static int64_t rd_pick_intra_sbuv_mode(VP8_COMP *cpi, + MACROBLOCK *x, + int *rate, + int *rate_tokenonly, + int *distortion) { + MB_PREDICTION_MODE mode; + MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + int64_t best_rd = INT64_MAX, this_rd; + int this_rate_tokenonly, this_rate; + int this_distortion; + + for (mode = DC_PRED; mode <= TM_PRED; mode++) { + x->e_mbd.mode_info_context->mbmi.uv_mode = mode; + RECON_INVOKE(&cpi->rtcd.common->recon, + build_intra_predictors_sbuv_s)(&x->e_mbd); + + super_block_uvrd_8x8(x, &this_rate_tokenonly, + &this_distortion, IF_RTCD(&cpi->rtcd)); + this_rate = this_rate_tokenonly + + x->mbmode_cost[x->e_mbd.frame_type] + [x->e_mbd.mode_info_context->mbmi.mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, 
this_rate, this_distortion); + + if (this_rd < best_rd) { + mode_selected = mode; + best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = this_distortion; + } + } + + x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected; + + return best_rd; +} +#endif + int vp8_cost_mv_ref(VP8_COMP *cpi, MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]) { @@ -2568,25 +2810,33 @@ static void vp8_estimate_ref_frame_costs(VP8_COMP *cpi, int segment_id, unsigned } } -static void store_coding_context(MACROBLOCK *x, int mb_index, +static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, PARTITION_INFO *partition, int_mv *ref_mv, - int_mv *second_ref_mv) { + int_mv *second_ref_mv, + int single_pred_diff, + int comp_pred_diff, + int hybrid_pred_diff) { MACROBLOCKD *xd = &x->e_mbd; // Take a snapshot of the coding context so it can be // restored if we decide to encode this way - x->mb_context[mb_index].best_mode_index = mode_index; - vpx_memcpy(&x->mb_context[mb_index].mic, xd->mode_info_context, + ctx->best_mode_index = mode_index; + vpx_memcpy(&ctx->mic, xd->mode_info_context, sizeof(MODE_INFO)); - vpx_memcpy(&x->mb_context[mb_index].partition_info, partition, - sizeof(PARTITION_INFO)); - x->mb_context[mb_index].best_ref_mv.as_int = ref_mv->as_int; - x->mb_context[mb_index].second_best_ref_mv.as_int = second_ref_mv->as_int; - - // x->mb_context[mb_index].rddiv = x->rddiv; - // x->mb_context[mb_index].rdmult = x->rdmult; + if (partition) + vpx_memcpy(&ctx->partition_info, partition, + sizeof(PARTITION_INFO)); + ctx->best_ref_mv.as_int = ref_mv->as_int; + ctx->second_best_ref_mv.as_int = second_ref_mv->as_int; + + // ctx[mb_index].rddiv = x->rddiv; + // ctx[mb_index].rdmult = x->rdmult; + + ctx->single_pred_diff = single_pred_diff; + ctx->comp_pred_diff = comp_pred_diff; + ctx->hybrid_pred_diff = hybrid_pred_diff; } static void inter_mode_cost(VP8_COMP *cpi, MACROBLOCK *x, int this_mode, @@ -3464,7 +3714,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int } #endif - if (x->skip) + if (x->skip && !mode_excluded) break; } @@ -3557,16 +3807,36 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int } end: - // TODO Save these to add in only if MB coding mode is selected? 
- for (i = 0; i < NB_PREDICTION_TYPES; ++i) - cpi->rd_comp_pred_diff[i] += best_pred_diff[i]; + store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index, &best_partition, + &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame], + &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame], + best_pred_diff[0], best_pred_diff[1], best_pred_diff[2]); +} - store_coding_context(x, xd->mb_index, best_mode_index, &best_partition, - &frame_best_ref_mv[mbmi->ref_frame], - &frame_best_ref_mv[mbmi->second_ref_frame]); +#if CONFIG_SUPERBLOCKS +void vp8_rd_pick_intra_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, + int *returnrate, + int *returndist) { + int rate_y, rate_uv; + int rate_y_tokenonly, rate_uv_tokenonly; + int error_y, error_uv; + int dist_y, dist_uv; + + x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8; + + error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + &dist_uv); + error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y); + + // TODO(rbultje): add rate_uv + *returnrate = rate_y; + *returndist = dist_y + (dist_uv >> 2); } +#endif -int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) { +void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, + int *returnrate, int *returndist) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; int64_t error4x4, error16x16; @@ -3585,6 +3855,8 @@ int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) { int rate8x8, dist8x8; int mode16x16; int mode8x8[2][4]; + int dist; + int rateuv8, rateuv_tokenonly8, distuv8; mbmi->ref_frame = INTRA_FRAME; rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv); @@ -3646,9 +3918,11 @@ int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) { rate += rate4x4; #endif mbmi->mode = B_PRED; + dist = dist4x4; } else { mbmi->mode = mode16x16; rate += rate16x16; + dist = dist16x16; } } else { if (error4x4 < error8x8) { @@ -3663,17 +3937,727 @@ int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) { rate += rate4x4; #endif mbmi->mode = B_PRED; + dist = dist4x4; } else { mbmi->mode = I8X8_PRED; set_i8x8_block_modes(x, mode8x8); rate += rate8x8; + dist = dist8x8; } } - return rate; + + // TODO(rbultje): should add rateuv here also + *returnrate = rate - rateuv; + *returndist = dist + (distuv >> 2); } -int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset) { +#if CONFIG_SUPERBLOCKS +int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, + int recon_yoffset, int recon_uvoffset, + int *returnrate, int *returndistortion) { + VP8_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + BLOCK *b = &x->block[0]; + BLOCKD *d = &xd->block[0]; + MB_PREDICTION_MODE this_mode; + MV_REFERENCE_FRAME ref_frame; + int mis = xd->mode_info_stride; + unsigned char segment_id = xd->mode_info_context->mbmi.segment_id; + int comp_pred; + int_mv best_ref_mv, second_best_ref_mv; + int_mv mode_mv[MB_MODE_COUNT]; + int_mv frame_nearest_mv[4]; + int_mv frame_near_mv[4]; + int_mv frame_best_ref_mv[4]; + int_mv mc_search_result[4]; + int frame_mdcounts[4][4]; + unsigned char *y_buffer[4]; + unsigned char *u_buffer[4]; + unsigned char *v_buffer[4]; + static const int flag_list[4] = { 0, VP8_LAST_FLAG, VP8_GOLD_FLAG, VP8_ALT_FLAG }; + int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx, cpi->common.alt_fb_idx }; + int mdcounts[4]; + int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + int saddone = 0; + int sr = 0; // search range got from mv_pred(). 
It uses step_param levels. (0-7) + int64_t best_rd = INT64_MAX; + int64_t best_comp_rd = INT64_MAX; + int64_t best_single_rd = INT64_MAX; + int64_t best_hybrid_rd = INT64_MAX; + int64_t best_yrd = INT64_MAX; + MB_MODE_INFO best_mbmode; + int mode_index = 0; +#if 0 + PARTITION_INFO best_partition; + union b_mode_info best_bmodes[16]; +#endif + unsigned int ref_costs[MAX_REF_FRAMES]; + + xd->mode_info_context->mbmi.segment_id = segment_id; + vp8_estimate_ref_frame_costs(cpi, segment_id, ref_costs); + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + YV12_BUFFER_CONFIG *ref_buf = &cpi->common.yv12_fb[idx_list[ref_frame]]; + + vp8_find_near_mvs(xd, xd->mode_info_context, + xd->prev_mode_info_context, + &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame], + &frame_best_ref_mv[ref_frame], frame_mdcounts[ref_frame], + ref_frame, cpi->common.ref_frame_sign_bias); + + y_buffer[ref_frame] = ref_buf->y_buffer + recon_yoffset; + u_buffer[ref_frame] = ref_buf->u_buffer + recon_uvoffset; + v_buffer[ref_frame] = ref_buf->v_buffer + recon_uvoffset; + } + mc_search_result[ref_frame].as_int = INVALID_MV; + } + + for (mode_index = 0; mode_index < MAX_MODES; mode_index++) { + int_mv mvp; + int mode_excluded; + int64_t this_rd = INT64_MAX; + int disable_skip = 0; + int other_cost = 0; + int compmode_cost = 0; + int rate2 = 0; + int distortion2 = 0; + int rate_y = 0; + int rate_uv = 0; + int distortion_uv; + int distortion; + int skippable_y, skippable_uv; + + // Test best rd so far against threshold for trying this mode. + if (best_rd <= cpi->rd_threshes[mode_index]) { + continue; + } + + this_mode = vp8_mode_order[mode_index].mode; + ref_frame = vp8_mode_order[mode_index].ref_frame; + xd->mode_info_context->mbmi.ref_frame = ref_frame; + comp_pred = vp8_mode_order[mode_index].second_ref_frame != INTRA_FRAME; + xd->mode_info_context->mbmi.mode = this_mode; + xd->mode_info_context->mbmi.uv_mode = DC_PRED; +#if 0 && CONFIG_PRED_FILTER + xd->mode_info_context->mbmi.pred_filter_enabled = 0; +#endif + +#if 0 && CONFIG_COMP_INTRA_PRED + xd->mode_info_context->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1); + xd->mode_info_context->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1); +#endif + + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) + continue; + + // not yet supported or not superblocky + // TODO(rbultje): support intra coding + if (ref_frame == INTRA_FRAME || this_mode == SPLITMV) + continue; + + if (comp_pred) { + int second_ref; + + if (ref_frame == ALTREF_FRAME) { + second_ref = LAST_FRAME; + } else { + second_ref = ref_frame + 1; + } + if (!(cpi->ref_frame_flags & flag_list[second_ref])) + continue; + xd->mode_info_context->mbmi.second_ref_frame = second_ref; + + xd->second_pre.y_buffer = y_buffer[second_ref]; + xd->second_pre.u_buffer = u_buffer[second_ref]; + xd->second_pre.v_buffer = v_buffer[second_ref]; + second_best_ref_mv = frame_best_ref_mv[second_ref]; + mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; + } else { + xd->mode_info_context->mbmi.second_ref_frame = INTRA_FRAME; + mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY; + } + + xd->pre.y_buffer = y_buffer[ref_frame]; + xd->pre.u_buffer = u_buffer[ref_frame]; + xd->pre.v_buffer = v_buffer[ref_frame]; + mode_mv[ZEROMV].as_int = 0; + mode_mv[NEARESTMV] = frame_nearest_mv[ref_frame]; + mode_mv[NEARMV] = frame_near_mv[ref_frame]; + best_ref_mv = frame_best_ref_mv[ref_frame]; + vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], 
sizeof(mdcounts)); + + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && + !check_segref(xd, segment_id, ref_frame)) { + continue; + } + // If the segment mode feature is enabled.... + // then do nothing if the current mode is not allowed.. + else if (segfeature_active(xd, segment_id, SEG_LVL_MODE) && + (this_mode != get_segdata(xd, segment_id, SEG_LVL_MODE))) { + continue; + } + // Disable this drop out case if either the mode or ref frame + // segment level feature is enabled for this segment. This is to + // prevent the possibility that we end up unable to pick any mode. + else if (!segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && + !segfeature_active(xd, segment_id, SEG_LVL_MODE)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative + if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) { + continue; + } + } + } + + if (!comp_pred) { + switch (this_mode) { + case NEWMV: { + int thissme; + int bestsme = INT_MAX; + int step_param = cpi->sf.first_step; + int further_steps; + int n; + int do_refine = 1; /* If last step (1-away) of n-step search doesn't pick the center point as the best match, + we will do a final 1-away diamond refining search */ + int num00; + + int sadpb = x->sadperbit16; + int_mv mvp_full; + + int col_min = (best_ref_mv.as_mv.col >> 3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7) ? 1 : 0); + int row_min = (best_ref_mv.as_mv.row >> 3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7) ? 1 : 0); + int col_max = (best_ref_mv.as_mv.col >> 3) + MAX_FULL_PEL_VAL; + int row_max = (best_ref_mv.as_mv.row >> 3) + MAX_FULL_PEL_VAL; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + + if (!saddone) { + vp8_cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0]); + saddone = 1; + } + + vp8_mv_pred(cpi, xs, xd->mode_info_context, &mvp, + xd->mode_info_context->mbmi.ref_frame, + cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]); + + mvp_full.as_mv.col = mvp.as_mv.col >> 3; + mvp_full.as_mv.row = mvp.as_mv.row >> 3; + + // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. + if (x->mv_col_min < col_min) + x->mv_col_min = col_min; + if (x->mv_col_max > col_max) + x->mv_col_max = col_max; + if (x->mv_row_min < row_min) + x->mv_row_min = row_min; + if (x->mv_row_max > row_max) + x->mv_row_max = row_max; + + // adjust search range according to sr from mv prediction + if (sr > step_param) + step_param = sr; + + // Initial step/diamond search + { + bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.as_mv.first, + step_param, sadpb, &num00, + &cpi->fn_ptr[BLOCK_32X32], + XMVCOST, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int; + + // Further step/diamond searches as necessary + n = 0; + further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + + n = num00; + num00 = 0; + + /* If there won't be more n-step search, check to see if refining search is needed. 
*/ + if (n > further_steps) + do_refine = 0; + + while (n < further_steps) { + n++; + + if (num00) + num00--; + else { + thissme = cpi->diamond_search_sad(x, b, d, &mvp_full, + &d->bmi.as_mv.first, step_param + n, sadpb, &num00, + &cpi->fn_ptr[BLOCK_32X32], + XMVCOST, &best_ref_mv); + + /* check to see if refining search is needed. */ + if (num00 > (further_steps - n)) + do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int; + } else { + d->bmi.as_mv.first.as_int = mode_mv[NEWMV].as_int; + } + } + } + } + + /* final 1-away diamond refining search */ + if (do_refine == 1) { + int search_range; + + // It seems not a good way to set search_range. Need further investigation. + // search_range = MAXF(abs((mvp.row>>3) - d->bmi.mv.as_mv.row), abs((mvp.col>>3) - d->bmi.mv.as_mv.col)); + search_range = 8; + + thissme = cpi->refining_search_sad(x, b, d, &d->bmi.as_mv.first, sadpb, + search_range, &cpi->fn_ptr[BLOCK_32X32], + XMVCOST, &best_ref_mv); + + if (thissme < bestsme) { + bestsme = thissme; + mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int; + } else { + d->bmi.as_mv.first.as_int = mode_mv[NEWMV].as_int; + } + } + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + + if (bestsme < INT_MAX) { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first, &best_ref_mv, + x->errorperbit, + &cpi->fn_ptr[BLOCK_32X32], + XMVCOST, &dis, &sse); + } + mc_search_result[xd->mode_info_context->mbmi.ref_frame].as_int = + d->bmi.as_mv.first.as_int; + + mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int; + + // Add the new motion vector cost to our rolling cost variable + rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, + XMVCOST, 96, + xd->allow_high_precision_mv); + } + + case NEARESTMV: + case NEARMV: + // Clip "next_nearest" so that it does not extend to far out of image + vp8_clamp_mv2(&mode_mv[this_mode], xd); + + // Do not bother proceeding if the vector (from newmv,nearest or near) is 0,0 as this should then be coded using the zeromv mode. + if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) && (mode_mv[this_mode].as_int == 0)) { + continue; + } + + case ZEROMV: + // Trap vectors that reach beyond the UMV borders + // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point + // because of the lack of break statements in the previous two cases. 
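// A minimal sketch of the border trap performed just below (hypothetical
// helper, not part of this patch): MV components in this tree are stored in
// 1/8-pel fixed point, so ">> 3" converts to whole pixels before comparing
// against the UMV search limits held in the MACROBLOCK.
//
//   static int mv_outside_umv_border(const MV *mv, const MACROBLOCK *x) {
//     int r = mv->row >> 3, c = mv->col >> 3;   /* 1/8-pel -> full pel */
//     return r < x->mv_row_min || r > x->mv_row_max ||
//            c < x->mv_col_min || c > x->mv_col_max;
//   }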
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) { + continue; + } + + vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]); + +#if CONFIG_PRED_FILTER + // Filtered prediction: + xd->mode_info_context->mbmi.pred_filter_enabled = + vp8_mode_order[mode_index].pred_filter_flag; + rate2 += vp8_cost_bit(cpi->common.prob_pred_filter_off, + xd->mode_info_context->mbmi.pred_filter_enabled); +#endif + + vp8_build_inter32x32_predictors_sb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride); + + compmode_cost = + vp8_cost_bit(get_pred_prob(cm, xd, PRED_COMP), 0); + + if (cpi->active_map_enabled && x->active_ptr[0] == 0) { + x->skip = 1; + } else if (x->encode_breakout) { + unsigned int sse; + unsigned int var; + int threshold = (xd->block[0].dequant[1] * + xd->block[0].dequant[1] >> 4); + + if (threshold < x->encode_breakout) + threshold = x->encode_breakout; + + var = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32)(*(b->base_src), + b->src_stride, xd->dst.y_buffer, xd->dst.y_stride, &sse); + + if (sse < threshold) { + unsigned int q2dc = xd->block[24].dequant[0]; + /* If there is no codeable 2nd order dc + or a very small uniform pixel change change */ + if ((sse - var < q2dc *q2dc >> 4) || + (sse / 2 > var && sse - var < 64)) { + // Check u and v to make sure skip is ok + int sse2, sse3; + int var2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) + (x->src.u_buffer, x->src.uv_stride, + xd->dst.u_buffer, xd->dst.uv_stride, &sse2); + int var3 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) + (x->src.v_buffer, x->src.uv_stride, + xd->dst.v_buffer, xd->dst.uv_stride, &sse3); + sse2 += sse3; + if (sse2 * 2 < threshold) { + x->skip = 1; + distortion2 = sse + sse2; + rate2 = 500; + + /* for best_yrd calculation */ + rate_uv = 0; + distortion_uv = sse2; + + disable_skip = 1; + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + break; + } + } + } + } + + // Add in the Mv/mode cost + rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts); + + // Y cost and distortion - FIXME support other transform sizes + super_block_yrd_8x8(x, &rate_y, &distortion, + IF_RTCD(&cpi->rtcd), &skippable_y); + rate2 += rate_y; + distortion2 += distortion; + + rd_inter32x32_uv_8x8(cpi, x, &rate_uv, &distortion_uv, + cpi->common.full_pixel, &skippable_uv); + + rate2 += rate_uv; + distortion2 += distortion_uv; + mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY; + break; + + default: + break; + } + } else { /* xd->mode_info_context->mbmi.second_ref_frame != 0 */ + int ref1 = xd->mode_info_context->mbmi.ref_frame; + int ref2 = xd->mode_info_context->mbmi.second_ref_frame; + + mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY; + switch (this_mode) { + case NEWMV: + if (mc_search_result[ref1].as_int == INVALID_MV || + mc_search_result[ref2].as_int == INVALID_MV) + continue; + xd->mode_info_context->mbmi.mv[0].as_int = mc_search_result[ref1].as_int; + xd->mode_info_context->mbmi.mv[1].as_int = mc_search_result[ref2].as_int; + rate2 += vp8_mv_bit_cost(&mc_search_result[ref1], + &frame_best_ref_mv[ref1], + XMVCOST, 96, + xd->allow_high_precision_mv); + rate2 += vp8_mv_bit_cost(&mc_search_result[ref2], + &frame_best_ref_mv[ref2], + XMVCOST, 96, + xd->allow_high_precision_mv); + break; + case ZEROMV: + xd->mode_info_context->mbmi.mv[0].as_int = 0; + 
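// Within this two-reference branch, NEWMV does not run a fresh joint search:
// it reuses the per-reference vectors cached in mc_search_result[] by the
// single-reference NEWMV case above, and skips the mode if either reference
// never produced a valid vector. The second reference was derived from the
// first earlier in the loop, roughly (sketch; LAST_FRAME/GOLDEN_FRAME/
// ALTREF_FRAME are the usual blockd.h enum values 1/2/3):
//
//   second_ref = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ref_frame + 1;
//
// which yields only the Last+Golden, Golden+AltRef and AltRef+Last pairings
// that the "second reference is always known" comment further down relies on.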
xd->mode_info_context->mbmi.mv[1].as_int = 0; + break; + case NEARMV: + if (frame_near_mv[ref1].as_int == 0 || frame_near_mv[ref2].as_int == 0) { + continue; + } + xd->mode_info_context->mbmi.mv[0].as_int = frame_near_mv[ref1].as_int; + xd->mode_info_context->mbmi.mv[1].as_int = frame_near_mv[ref2].as_int; + break; + case NEARESTMV: + if (frame_nearest_mv[ref1].as_int == 0 || frame_nearest_mv[ref2].as_int == 0) { + continue; + } + xd->mode_info_context->mbmi.mv[0].as_int = frame_nearest_mv[ref1].as_int; + xd->mode_info_context->mbmi.mv[1].as_int = frame_nearest_mv[ref2].as_int; + break; + default: + break; + } + + /* Add in the Mv/mode cost */ + rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts); + + vp8_clamp_mv2(&xd->mode_info_context->mbmi.mv[0], xd); + vp8_clamp_mv2(&xd->mode_info_context->mbmi.mv[1], xd); + if (((xd->mode_info_context->mbmi.mv[0].as_mv.row >> 3) < x->mv_row_min) || + ((xd->mode_info_context->mbmi.mv[0].as_mv.row >> 3) > x->mv_row_max) || + ((xd->mode_info_context->mbmi.mv[0].as_mv.col >> 3) < x->mv_col_min) || + ((xd->mode_info_context->mbmi.mv[0].as_mv.col >> 3) > x->mv_col_max) || + ((xd->mode_info_context->mbmi.mv[1].as_mv.row >> 3) < x->mv_row_min) || + ((xd->mode_info_context->mbmi.mv[1].as_mv.row >> 3) > x->mv_row_max) || + ((xd->mode_info_context->mbmi.mv[1].as_mv.col >> 3) < x->mv_col_min) || + ((xd->mode_info_context->mbmi.mv[1].as_mv.col >> 3) > x->mv_col_max)) { + continue; + } + + /* build first and second prediction */ + vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); + + /* Y cost and distortion - TODO(rbultje) support other transform sizes */ + super_block_yrd_8x8(x, &rate_y, &distortion, + IF_RTCD(&cpi->rtcd), &skippable_y); + + rate2 += rate_y; + distortion2 += distortion; + + /* UV cost and distortion */ + rd_inter32x32_uv_8x8(cpi, x, &rate_uv, &distortion_uv, + cpi->common.full_pixel, &skippable_uv); + + rate2 += rate_uv; + distortion2 += distortion_uv; + + /* don't bother w/ skip, we would never have come here if skip were + * enabled */ + xd->mode_info_context->mbmi.mode = this_mode; + + /* We don't include the cost of the second reference here, because there + * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in + * other words if you present them in that order, the second one is + * always known if the first is known */ + compmode_cost = vp8_cost_bit(get_pred_prob(cm, xd, PRED_COMP), 1); + } + + if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) { + rate2 += compmode_cost; + } + + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame]; + + if (!disable_skip) { + // Test for the condition where skip block will be activated + // because there are no non zero coefficients and make any + // necessary adjustment for rate. Ignore if skip is coded at + // segment level as the cost wont have been added in. + if (cpi->common.mb_no_coeff_skip) { + int mb_skippable = skippable_y && skippable_uv; + int mb_skip_allowed; + + // Is Mb level skip allowed for this mb. 
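// A condensed view of the skip-flag rate accounting implemented just below
// (illustrative restatement, not part of the patch): when both planes are
// fully skippable the coefficient rate is backed out and replaced by the cost
// of signalling "skip"; otherwise, if skip signalling is allowed at all, the
// cost of a zero skip flag is charged.
//
//   if (mb_skippable) {
//     rate2 -= rate_y + rate_uv;                  /* no coefficients coded  */
//     if (mb_skip_allowed)
//       rate2 += vp8_cost_bit(skip_prob, 1);      /* "skip this MB" flag    */
//   } else if (mb_skip_allowed) {
//     rate2 += vp8_cost_bit(skip_prob, 0);        /* "do not skip" flag     */
//   }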
+ mb_skip_allowed = + !segfeature_active(xd, segment_id, SEG_LVL_EOB) || + get_segdata(xd, segment_id, SEG_LVL_EOB); + + if (mb_skippable) { + // Back out the coefficient coding costs + rate2 -= (rate_y + rate_uv); + // for best_yrd calculation + rate_uv = 0; + + if (mb_skip_allowed) { + int prob_skip_cost; + + // Cost the skip mb case + vp8_prob skip_prob = + get_pred_prob(cm, xd, PRED_MBSKIP); + + if (skip_prob) { + prob_skip_cost = vp8_cost_bit(skip_prob, 1); + rate2 += prob_skip_cost; + other_cost += prob_skip_cost; + } + } + } + // Add in the cost of the no skip flag. + else if (mb_skip_allowed) { + int prob_skip_cost = vp8_cost_bit(get_pred_prob(cm, xd, + PRED_MBSKIP), 0); + rate2 += prob_skip_cost; + other_cost += prob_skip_cost; + } + } + + // Calculate the final RD estimate for this mode. + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + } + +#if 0 + // Keep record of best intra distortion + if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && + (this_rd < best_intra_rd)) { + best_intra_rd = this_rd; + *returnintra = distortion2; + } +#endif + + if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { + if (this_rd < best_comp_rd) + best_comp_rd = this_rd; + if (this_rd < best_single_rd) + best_single_rd = this_rd; + if (this_rd < best_hybrid_rd) + best_hybrid_rd = this_rd; + } + + // Did this mode help.. i.e. is it the new best mode + if (this_rd < best_rd || x->skip) { + if (!mode_excluded) { +#if 0 + // Note index of best mode so far + best_mode_index = mode_index; + + if (this_mode <= B_PRED) { + xd->mode_info_context->mbmi.uv_mode = uv_intra_mode_8x8; + /* required for left and above block mv */ + xd->mode_info_context->mbmi.mv.as_int = 0; + } +#endif + + other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame]; + + /* Calculate the final y RD estimate for this mode */ + best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost), + (distortion2 - distortion_uv)); + + *returnrate = rate2; + *returndistortion = distortion2; + best_rd = this_rd; + vpx_memcpy(&best_mbmode, &xd->mode_info_context->mbmi, sizeof(MB_MODE_INFO)); + } +#if 0 + // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time + cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; +#endif + } + // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around. 
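// Note: the per-mode threshold adaptation in the branch below is disabled
// (#if 0) in this superblock path, so only the "best_rd <= rd_threshes"
// prune at the top of the mode loop is active. The RD scores being compared
// come from the RDCOST macro; assuming the usual rdopt.h definition,
//
//   /* #define RDCOST(RM, DM, R, D)  (((128 + (R) * (RM)) >> 8) + (DM) * (D)) */
//
// i.e. cost = lambda * rate + distortion with lambda = rdmult / 256.
// Example: rdmult = 512, rddiv = 1, rate2 = 100, distortion2 = 4000
//          -> this_rd = ((128 + 100 * 512) >> 8) + 4000 = 200 + 4000 = 4200.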
+ else { +#if 0 + cpi->rd_thresh_mult[mode_index] += 4; + + if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) + cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; +#endif + } + + /* keep record of best compound/single-only prediction */ + if (!disable_skip && xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) { + int single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) { + single_rate = rate2 - compmode_cost; + hybrid_rate = rate2; + } else { + single_rate = rate2; + hybrid_rate = rate2 + compmode_cost; + } + + single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); + hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); + + if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME && + single_rd < best_single_rd) { + best_single_rd = single_rd; + } else if (xd->mode_info_context->mbmi.second_ref_frame != INTRA_FRAME && + single_rd < best_comp_rd) { + best_comp_rd = single_rd; + } + if (hybrid_rd < best_hybrid_rd) { + best_hybrid_rd = hybrid_rd; + } + } + + if (x->skip && !mode_excluded) + break; + } + + // TODO(rbultje) integrate with RD thresholding +#if 0 + // Reduce the activation RD thresholds for the best choice mode + if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && + (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) { + int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2); + + cpi->rd_thresh_mult[best_mode_index] = + (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? + cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT; + cpi->rd_threshes[best_mode_index] = + (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index]; + } +#endif + + // This code forces Altref,0,0 and skip for the frame that overlays a + // an alrtef unless Altref is filtered. However, this is unsafe if + // segment level coding of ref frame or mode is enabled for this + // segment. + if (!segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && + !segfeature_active(xd, segment_id, SEG_LVL_MODE) && + cpi->is_src_frame_alt_ref && + (cpi->oxcf.arnr_max_frames == 0) && + (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) { + xd->mode_info_context->mbmi.mode = ZEROMV; + xd->mode_info_context->mbmi.ref_frame = ALTREF_FRAME; + xd->mode_info_context->mbmi.mv[0].as_int = 0; + xd->mode_info_context->mbmi.uv_mode = DC_PRED; + xd->mode_info_context->mbmi.mb_skip_coeff = + (cpi->common.mb_no_coeff_skip) ? 1 : 0; + xd->mode_info_context->mbmi.partitioning = 0; + + xd->mode_info_context->mbmi.txfm_size = TX_8X8; + + if (best_rd != INT64_MAX) + store_coding_context(x, &x->sb_context[0], mode_index, NULL, + &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame], + &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame], + 0, 0, 0); + return best_rd; + } + + // macroblock modes + vpx_memcpy(&xd->mode_info_context->mbmi, &best_mbmode, + sizeof(MB_MODE_INFO)); + xd->mode_info_context->mbmi.txfm_size = TX_8X8; + + if (best_rd != INT64_MAX) + store_coding_context(x, &x->sb_context[0], mode_index, NULL, + &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame], + &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame], + (best_single_rd == INT64_MAX) ? INT_MIN : (best_rd - best_single_rd), + (best_comp_rd == INT64_MAX) ? INT_MIN : (best_rd - best_comp_rd), + (best_hybrid_rd == INT64_MAX) ? 
INT_MIN : (best_rd - best_hybrid_rd)); + + return best_rd; +} +#endif + +void vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, + int recon_yoffset, + int recon_uvoffset, + int *totalrate, int *totaldist) { VP8_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; @@ -3694,17 +4678,6 @@ int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error); - if (mbmi->ref_frame) { - unsigned char pred_context; - - pred_context = get_pred_context(cm, xd, PRED_COMP); - - if (mbmi->second_ref_frame == INTRA_FRAME) - cpi->single_pred_count[pred_context]++; - else - cpi->comp_pred_count[pred_context]++; - } - /* restore cpi->zbin_mode_boost_enabled */ cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled; } @@ -3717,5 +4690,6 @@ int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, x->mb_context[xd->mb_index].distortion = distortion; x->mb_context[xd->mb_index].intra_error = intra_error; - return rate; + *totalrate = rate; + *totaldist = distortion; } diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h index 2b5928de9..0e36a519d 100644 --- a/vp8/encoder/rdopt.h +++ b/vp8/encoder/rdopt.h @@ -18,7 +18,8 @@ extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue); extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int64_t *returnintra); -extern int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x); +extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *r, int *d); +extern void vp8_rd_pick_intra_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, int *r, int *d); extern void vp8_mv_pred ( diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c index 78a87f392..4fdfd1186 100644 --- a/vp8/encoder/sad_c.c +++ b/vp8/encoder/sad_c.c @@ -13,29 +13,6 @@ #include "vpx_ports/config.h" #include "vpx/vpx_integer.h" -unsigned int vp8_sad16x16_c( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride, - int max_sad) { - - int r, c; - unsigned int sad = 0; - - for (r = 0; r < 16; r++) { - for (c = 0; c < 16; c++) { - sad += abs(src_ptr[c] - ref_ptr[c]); - } - - src_ptr += src_stride; - ref_ptr += ref_stride; - } - - return sad; -} - - static __inline unsigned int sad_mx_n_c( const unsigned char *src_ptr, @@ -60,6 +37,21 @@ unsigned int sad_mx_n_c( return sad; } +unsigned int vp8_sad32x32_c(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + int max_sad) { + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32); +} + +unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + int max_sad) { + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16); +} unsigned int vp8_sad8x8_c( const unsigned char *src_ptr, @@ -104,6 +96,7 @@ unsigned int vp8_sad4x4_c( return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4); } + #if CONFIG_NEWBESTREFMV unsigned int vp8_sad2x16_c( const unsigned char *src_ptr, @@ -122,6 +115,34 @@ unsigned int vp8_sad16x2_c( return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 2); } #endif + +void vp8_sad32x32x3_c(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned int *sad_array + ) { + sad_array[0] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr, 
ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); +} + +void vp8_sad32x32x8_c(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned short *sad_array + ) { + sad_array[0] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[3] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, 0x7fffffff); + sad_array[4] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); + sad_array[5] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); + sad_array[6] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, 0x7fffffff); + sad_array[7] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); +} + void vp8_sad16x16x3_c( const unsigned char *src_ptr, int src_stride, @@ -267,6 +288,18 @@ void vp8_sad4x4x8_c( sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); } +void vp8_sad32x32x4d_c(const unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr[], + int ref_stride, + unsigned int *sad_array + ) { + sad_array[0] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); +} + void vp8_sad16x16x4d_c( const unsigned char *src_ptr, int src_stride, diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c index e9d02cdd4..e88b80d34 100644 --- a/vp8/encoder/segmentation.c +++ b/vp8/encoder/segmentation.c @@ -200,42 +200,59 @@ void choose_segmap_coding_method(VP8_COMP *cpi) { // in the frame xd->mode_info_context = cm->mi; - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - segment_id = xd->mode_info_context->mbmi.segment_id; - - // Count the number of hits on each segment with no prediction - no_pred_segcounts[segment_id]++; - - // Temporal prediction not allowed on key frames - if (cm->frame_type != KEY_FRAME) { - // Test to see if the segment id matches the predicted value. 
- int seg_predicted = - (segment_id == get_pred_mb_segid(cm, segmap_index)); - - // Get the segment id prediction context - pred_context = - get_pred_context(cm, xd, PRED_SEG_ID); - - // Store the prediction status for this mb and update counts - // as appropriate - set_pred_flag(xd, PRED_SEG_ID, seg_predicted); - temporal_predictor_count[pred_context][seg_predicted]++; - - if (!seg_predicted) - // Update the "unpredicted" segment count - t_unpred_seg_counts[segment_id]++; - } + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) { + for (i = 0; i < 4; i++) { + static const int dx[4] = { +1, -1, +1, +1 }; + static const int dy[4] = { 0, +1, 0, -1 }; + int x_idx = i & 1, y_idx = i >> 1; + + if (mb_col + x_idx >= cm->mb_cols || + mb_row + y_idx >= cm->mb_rows) { + goto end; + } + + segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx; + segment_id = xd->mode_info_context->mbmi.segment_id; + + // Count the number of hits on each segment with no prediction + no_pred_segcounts[segment_id]++; + + // Temporal prediction not allowed on key frames + if (cm->frame_type != KEY_FRAME) { + // Test to see if the segment id matches the predicted value. + int seg_predicted = + (segment_id == get_pred_mb_segid(cm, segmap_index)); - // Step on to the next mb - xd->mode_info_context++; + // Get the segment id prediction context + pred_context = + get_pred_context(cm, xd, PRED_SEG_ID); - // Step on to the next entry in the segment maps - segmap_index++; + // Store the prediction status for this mb and update counts + // as appropriate + set_pred_flag(xd, PRED_SEG_ID, seg_predicted); + temporal_predictor_count[pred_context][seg_predicted]++; + + if (!seg_predicted) + // Update the "unpredicted" segment count + t_unpred_seg_counts[segment_id]++; + } + +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + assert(!i); + xd->mode_info_context += 2; + break; + } +#endif + end: + xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride; + } } // this is to account for the border in mode_info_context - xd->mode_info_context++; + xd->mode_info_context -= mb_col; + xd->mode_info_context += cm->mode_info_stride * 2; } // Work out probability tree for coding segments without prediction diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h index 73a0a6b99..e17733c58 100644 --- a/vp8/encoder/variance.h +++ b/vp8/encoder/variance.h @@ -145,8 +145,18 @@ extern prototype_sad(vp8_variance_sad16x8); #endif extern prototype_sad(vp8_variance_sad16x16); +#ifndef vp8_variance_sad32x32 +#define vp8_variance_sad32x32 vp8_sad32x32_c +#endif +extern prototype_sad(vp8_variance_sad32x32); + // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +#ifndef vp8_variance_sad32x32x3 +#define vp8_variance_sad32x32x3 vp8_sad32x32x3_c +#endif +extern prototype_sad_multi_same_address(vp8_variance_sad32x32x3); + #ifndef vp8_variance_sad16x16x3 #define vp8_variance_sad16x16x3 vp8_sad16x16x3_c #endif @@ -172,6 +182,11 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3); #endif extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3); +#ifndef vp8_variance_sad32x32x8 +#define vp8_variance_sad32x32x8 vp8_sad32x32x8_c +#endif +extern prototype_sad_multi_same_address_1(vp8_variance_sad32x32x8); + #ifndef vp8_variance_sad16x16x8 #define vp8_variance_sad16x16x8 vp8_sad16x16x8_c #endif @@ -199,6 +214,11 @@ extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8); // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +#ifndef 
vp8_variance_sad32x32x4d +#define vp8_variance_sad32x32x4d vp8_sad32x32x4d_c +#endif +extern prototype_sad_multi_dif_address(vp8_variance_sad32x32x4d); + #ifndef vp8_variance_sad16x16x4d #define vp8_variance_sad16x16x4d vp8_sad16x16x4d_c #endif @@ -258,6 +278,11 @@ extern prototype_variance(vp8_variance_var16x8); #endif extern prototype_variance(vp8_variance_var16x16); +#ifndef vp8_variance_var32x32 +#define vp8_variance_var32x32 vp8_variance32x32_c +#endif +extern prototype_variance(vp8_variance_var32x32); + // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- #ifndef vp8_variance_subpixvar4x4 @@ -285,26 +310,51 @@ extern prototype_subpixvariance(vp8_variance_subpixvar16x8); #endif extern prototype_subpixvariance(vp8_variance_subpixvar16x16); +#ifndef vp8_variance_subpixvar32x32 +#define vp8_variance_subpixvar32x32 vp8_sub_pixel_variance32x32_c +#endif +extern prototype_subpixvariance(vp8_variance_subpixvar32x32); + #ifndef vp8_variance_halfpixvar16x16_h #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c #endif extern prototype_variance(vp8_variance_halfpixvar16x16_h); +#ifndef vp8_variance_halfpixvar32x32_h +#define vp8_variance_halfpixvar32x32_h vp8_variance_halfpixvar32x32_h_c +#endif +extern prototype_variance(vp8_variance_halfpixvar32x32_h); + #ifndef vp8_variance_halfpixvar16x16_v #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c #endif extern prototype_variance(vp8_variance_halfpixvar16x16_v); +#ifndef vp8_variance_halfpixvar32x32_v +#define vp8_variance_halfpixvar32x32_v vp8_variance_halfpixvar32x32_v_c +#endif +extern prototype_variance(vp8_variance_halfpixvar32x32_v); + #ifndef vp8_variance_halfpixvar16x16_hv #define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c #endif extern prototype_variance(vp8_variance_halfpixvar16x16_hv); +#ifndef vp8_variance_halfpixvar32x32_hv +#define vp8_variance_halfpixvar32x32_hv vp8_variance_halfpixvar32x32_hv_c +#endif +extern prototype_variance(vp8_variance_halfpixvar32x32_hv); + #ifndef vp8_variance_subpixmse16x16 #define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_c #endif extern prototype_subpixvariance(vp8_variance_subpixmse16x16); +#ifndef vp8_variance_subpixmse32x32 +#define vp8_variance_subpixmse32x32 vp8_sub_pixel_mse32x32_c +#endif +extern prototype_subpixvariance(vp8_variance_subpixmse32x32); + // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- #ifndef vp8_variance_getmbss @@ -349,38 +399,66 @@ typedef struct { vp8_sad_fn_t sad8x16; vp8_sad_fn_t sad16x8; vp8_sad_fn_t sad16x16; +#if CONFIG_SUPERBLOCKS + vp8_sad_fn_t sad32x32; +#endif vp8_variance_fn_t var4x4; vp8_variance_fn_t var8x8; vp8_variance_fn_t var8x16; vp8_variance_fn_t var16x8; vp8_variance_fn_t var16x16; +#if CONFIG_SUPERBLOCKS + vp8_variance_fn_t var32x32; +#endif vp8_subpixvariance_fn_t subpixvar4x4; vp8_subpixvariance_fn_t subpixvar8x8; vp8_subpixvariance_fn_t subpixvar8x16; vp8_subpixvariance_fn_t subpixvar16x8; vp8_subpixvariance_fn_t subpixvar16x16; +#if CONFIG_SUPERBLOCKS + vp8_subpixvariance_fn_t subpixvar32x32; +#endif vp8_variance_fn_t halfpixvar16x16_h; + vp8_variance_fn_t halfpixvar32x32_h; vp8_variance_fn_t halfpixvar16x16_v; +#if CONFIG_SUPERBLOCKS + vp8_variance_fn_t halfpixvar32x32_v; +#endif vp8_variance_fn_t halfpixvar16x16_hv; +#if CONFIG_SUPERBLOCKS + vp8_variance_fn_t halfpixvar32x32_hv; +#endif vp8_subpixvariance_fn_t subpixmse16x16; +#if CONFIG_SUPERBLOCKS + vp8_subpixvariance_fn_t subpixmse32x32; +#endif vp8_getmbss_fn_t getmbss; vp8_variance_fn_t mse16x16; +#if CONFIG_SUPERBLOCKS + 
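// These CONFIG_SUPERBLOCKS members extend the runtime variance/SAD dispatch
// table with 32x32 entries; the #ifndef/#define fallbacks earlier in this
// header bind the default names to the new *_c implementations, and the
// table is filled during the usual system-dependent setup (not shown in this
// hunk). A rough sketch of how the superblock RD code reaches an entry,
// mirroring the var32x32 call in rdopt.c:
//
//   unsigned int sse;
//   unsigned int var = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32)(
//       src_ptr, src_stride, pred_ptr, pred_stride, &sse);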
vp8_sad_multi_fn_t sad32x32x3; +#endif vp8_sad_multi_fn_t sad16x16x3; vp8_sad_multi_fn_t sad16x8x3; vp8_sad_multi_fn_t sad8x16x3; vp8_sad_multi_fn_t sad8x8x3; vp8_sad_multi_fn_t sad4x4x3; +#if CONFIG_SUPERBLOCKS + vp8_sad_multi1_fn_t sad32x32x8; +#endif vp8_sad_multi1_fn_t sad16x16x8; vp8_sad_multi1_fn_t sad16x8x8; vp8_sad_multi1_fn_t sad8x16x8; vp8_sad_multi1_fn_t sad8x8x8; vp8_sad_multi1_fn_t sad4x4x8; +#if CONFIG_SUPERBLOCKS + vp8_sad_multi_d_fn_t sad32x32x4d; +#endif vp8_sad_multi_d_fn_t sad16x16x4d; vp8_sad_multi_d_fn_t sad16x8x4d; vp8_sad_multi_d_fn_t sad8x16x4d; diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c index 0b9d569b0..cbe2a51d6 100644 --- a/vp8/encoder/variance_c.c +++ b/vp8/encoder/variance_c.c @@ -55,6 +55,20 @@ static void variance( } } +#if CONFIG_SUPERBLOCKS +unsigned int vp8_variance32x32_c(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg); + *sse = var; + return (var - ((avg * avg) >> 10)); +} +#endif unsigned int vp8_variance16x16_c( const unsigned char *src_ptr, @@ -334,6 +348,27 @@ unsigned int vp8_sub_pixel_variance16x16_c return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); } +#if CONFIG_SUPERBLOCKS +unsigned int vp8_sub_pixel_variance32x32_c(const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) { + unsigned short FData3[33 * 32]; // Temp data bufffer used in filtering + unsigned char temp2[36 * 32]; + const short *HFilter, *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter); + var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter); + + return vp8_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse); +} +#endif unsigned int vp8_variance_halfpixvar16x16_h_c( const unsigned char *src_ptr, @@ -345,17 +380,38 @@ unsigned int vp8_variance_halfpixvar16x16_h_c( ref_ptr, recon_stride, sse); } +#if CONFIG_SUPERBLOCKS +unsigned int vp8_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0, + ref_ptr, recon_stride, sse); +} +#endif + -unsigned int vp8_variance_halfpixvar16x16_v_c( +unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8, + ref_ptr, recon_stride, sse); +} + +#if CONFIG_SUPERBLOCKS +unsigned int vp8_variance_halfpixvar32x32_v_c( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { - return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8, + return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8, ref_ptr, recon_stride, sse); } - +#endif unsigned int vp8_variance_halfpixvar16x16_hv_c( const unsigned char *src_ptr, @@ -367,6 +423,16 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c( ref_ptr, recon_stride, sse); } +#if CONFIG_SUPERBLOCKS +unsigned int vp8_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr, + int 
source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8, + ref_ptr, recon_stride, sse); +} +#endif unsigned int vp8_sub_pixel_mse16x16_c ( @@ -382,6 +448,19 @@ unsigned int vp8_sub_pixel_mse16x16_c return *sse; } +#if CONFIG_SUPERBLOCKS +unsigned int vp8_sub_pixel_mse32x32_c(const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) { + vp8_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); + return *sse; +} +#endif + unsigned int vp8_sub_pixel_variance16x8_c ( const unsigned char *src_ptr,
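// Worked check of the normalization used by vp8_variance32x32_c above: a
// 32x32 block has N = 1024 = 1 << 10 samples, so
//
//   variance = SSE - sum*sum / N   ==   var - ((avg * avg) >> 10)
//
// e.g. if every residual is 2: SSE = 4096, sum = 2048, and
//   4096 - ((2048 * 2048) >> 10) = 4096 - 4096 = 0
// (a constant offset has zero variance, as expected). One caution: avg * avg
// is a 32-bit multiply, and the worst-case |sum| for 8-bit data is
// 1024 * 255 = 261120, whose square does not fit in 32 bits, so pathological
// all-maximal residuals could overflow here. The sub-pixel and half-pel
// 32x32 variants simply bilinear-filter a 33x32 intermediate down to 32x32
// and then reuse vp8_variance32x32_c.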