From: John Koleszar <jkoleszar@google.com>
Date: Tue, 8 Jan 2013 18:11:26 +0000 (-0800)
Subject: Merge vp9-preview changes into experimental branch
X-Git-Tag: v1.3.0~1210^2~27
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=879cb7d96259a71eea0038452a00241650589084;p=libvpx

Merge vp9-preview changes into experimental branch

Incorporate vp9-preview changes by merging master branch into experimental.

Conflicts:
	test/test.mk
	vp9/common/vp9_filter.c
	vp9/common/vp9_idctllm.c
	vp9/common/vp9_invtrans.h
	vp9/common/vp9_mbpitch.c
	vp9/common/vp9_rtcd_defs.sh
	vp9/common/vp9_systemdependent.h
	vp9/common/vp9_type_aliases.h
	vp9/common/x86/vp9_asm_stubs.c
	vp9/common/x86/vp9_subpixel_mmx.asm
	vp9/decoder/vp9_decodframe.c
	vp9/decoder/vp9_dequantize.c
	vp9/decoder/vp9_dequantize.h
	vp9/decoder/vp9_onyxd_int.h
	vp9/encoder/vp9_bitstream.c
	vp9/encoder/vp9_encodeframe.c
	vp9/encoder/vp9_rdopt.c

Change-Id: I17f51c3666d1b59cf1a699f87607cbc5d30a87c5
---

879cb7d96259a71eea0038452a00241650589084
diff --cc test/test.mk
index 919cf0438,cb15fcef8..28d387264
--- a/test/test.mk
+++ b/test/test.mk
@@@ -64,11 -68,7 +68,10 @@@ endi
  LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
  LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
  #LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
+ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 +ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_TX32X32),yesyes)
 +LIBVPX_TEST_SRCS-yes += dct32x32_test.cc
 +endif
- LIBVPX_TEST_SRCS-yes += idct8x8_test.cc
- LIBVPX_TEST_SRCS-yes += variance_test.cc
  endif # VP9
  
  
diff --cc vp9/common/vp9_idctllm.c
index 6cbc25967,893f378b5..4dd540e2a
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@@ -24,10 -24,10 +24,10 @@@
   **************************************************************************/
  #include <assert.h>
  #include <math.h>
- #include "vpx_ports/config.h"
+ #include "./vpx_config.h"
  #include "vp9/common/vp9_systemdependent.h"
 -
  #include "vp9/common/vp9_blockd.h"
 +#include "vp9/common/vp9_common.h"
  
  static const int cospi8sqrt2minus1 = 20091;
  static const int sinpi8sqrt2      = 35468;
@@@ -279,133 -157,16 +157,16 @@@ static const int16_t iadst_i16[256] = 
     3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542
  };
  
- void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
-                   TX_TYPE tx_type, int tx_dim) {
-   vp9_clear_system_state();  // Make it simd safe : __asm emms;
-   {
-     int i, j, k;
-     float bufa[256], bufb[256];  // buffers are for floating-point test purpose
-                                  // the implementation could be simplified in
-                                  // conjunction with integer transform
-     const int16_t *ip = input;
-     int16_t *op = output;
-     int shortpitch = pitch >> 1;
- 
-     float *pfa = &bufa[0];
-     float *pfb = &bufb[0];
- 
-     // pointers to vertical and horizontal transforms
-     const float *ptv, *pth;
- 
-     assert(tx_type != DCT_DCT);
-     // load and convert residual array into floating-point
-     for(j = 0; j < tx_dim; j++) {
-       for(i = 0; i < tx_dim; i++) {
-         pfa[i] = (float)ip[i];
-       }
-       pfa += tx_dim;
-       ip  += tx_dim;
-     }
- 
-     // vertical transformation
-     pfa = &bufa[0];
-     pfb = &bufb[0];
- 
-     switch(tx_type) {
-       case ADST_ADST :
-       case ADST_DCT  :
-         ptv = (tx_dim == 4) ? &iadst_4[0] :
-                               ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-         break;
- 
-       default :
-         ptv = (tx_dim == 4) ? &idct_4[0] :
-                               ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-         break;
-     }
- 
-     for(j = 0; j < tx_dim; j++) {
-       for(i = 0; i < tx_dim; i++) {
-         pfb[i] = 0 ;
-         for(k = 0; k < tx_dim; k++) {
-           pfb[i] += ptv[k] * pfa[(k * tx_dim)];
-         }
-         pfa += 1;
-       }
- 
-       pfb += tx_dim;
-       ptv += tx_dim;
-       pfa = &bufa[0];
-     }
- 
-     // horizontal transformation
-     pfa = &bufa[0];
-     pfb = &bufb[0];
- 
-     switch(tx_type) {
-       case ADST_ADST :
-       case  DCT_ADST :
-         pth = (tx_dim == 4) ? &iadst_4[0] :
-                               ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-         break;
- 
-       default :
-         pth = (tx_dim == 4) ? &idct_4[0] :
-                               ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-         break;
-     }
- 
-     for(j = 0; j < tx_dim; j++) {
-       for(i = 0; i < tx_dim; i++) {
-         pfa[i] = 0;
-         for(k = 0; k < tx_dim; k++) {
-           pfa[i] += pfb[k] * pth[k];
-         }
-         pth += tx_dim;
-        }
- 
-       pfa += tx_dim;
-       pfb += tx_dim;
- 
-       switch(tx_type) {
-         case ADST_ADST :
-         case  DCT_ADST :
-           pth = (tx_dim == 4) ? &iadst_4[0] :
-                                 ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-           break;
- 
-         default :
-           pth = (tx_dim == 4) ? &idct_4[0] :
-                                 ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-           break;
-       }
-     }
- 
-     // convert to short integer format and load BLOCKD buffer
-     op  = output;
-     pfa = &bufa[0];
- 
-     for(j = 0; j < tx_dim; j++) {
-       for(i = 0; i < tx_dim; i++) {
-         op[i] = (pfa[i] > 0 ) ? (int16_t)( pfa[i] / 8 + 0.49) :
-                                -(int16_t)( - pfa[i] / 8 + 0.49);
-       }
- 
-       op += shortpitch;
-       pfa += tx_dim;
-     }
-   }
-   vp9_clear_system_state(); // Make it simd safe : __asm emms;
- }
  
  /* Converted the transforms to integer form. */
 -#define VERTICAL_SHIFT 14  // 16
 -#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
 -#define HORIZONTAL_SHIFT 17  // 15
 +#define HORIZONTAL_SHIFT 14  // 16
  #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
 +#define VERTICAL_SHIFT 17  // 15
 +#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
  void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
-                       TX_TYPE tx_type, int tx_dim) {
+                       TX_TYPE tx_type, int tx_dim, uint16_t eobs) {
    int i, j, k;
+   int nz_dim;
    int16_t imbuf[256];
  
    const int16_t *ip = input;
@@@ -444,47 -205,54 +205,60 @@@
        break;
    }
  
+   nz_dim = tx_dim;
+   if(tx_dim > 4) {
+     if(eobs < 36) {
+       vpx_memset(im, 0, 512);
+       nz_dim = 8;
+       if(eobs < 3) {
+         nz_dim = 2;
+       } else if(eobs < 10) {
+         nz_dim = 4;
+       }
+     }
+   }
+ 
 -  /* vertical transformation */
 +  /* 2-D inverse transform X = M1*Z*Transposed_M2 is calculated in 2 steps
 +   * from right to left:
 +   * 1. horizontal transform: Y= Z*Transposed_M2
 +   * 2. vertical transform: X = M1*Y
 +   * In SIMD, doing this way could eliminate the transpose needed if it is
 +   * calculated from left to right.
 +   */
 +  /* Horizontal transformation */
    for (j = 0; j < tx_dim; j++) {
-     for (i = 0; i < tx_dim; i++) {
+     for (i = 0; i < nz_dim; i++) {
        int temp = 0;
  
-       for (k = 0; k < tx_dim; k++) {
+       for (k = 0; k < nz_dim; k++) {
 -        temp += ptv[k] * ip[(k * tx_dim)];
 +        temp += ip[k] * pth[k];
        }
  
 -      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
 -      ip++;
 +      /* Calculate im and store it in its transposed position. */
 +      im[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
 +      ip += tx_dim;
      }
 -    im += tx_dim;  // 16
 -    ptv += tx_dim;
 +    im += tx_dim;
 +    pth += tx_dim;
      ip = input;
    }
  
 -  /* horizontal transformation */
 +  /* Vertical transformation */
    im = &imbuf[0];
  
 -  for (j = 0; j < tx_dim; j++) {
 -    const int16_t *pthc = pth;
 -
 -    for (i = 0; i < tx_dim; i++) {
 +  for (i = 0; i < tx_dim; i++) {
 +    for (j = 0; j < tx_dim; j++) {
        int temp = 0;
  
-       for (k = 0; k < tx_dim; k++) {
+       for (k = 0; k < nz_dim; k++) {
 -        temp += im[k] * pthc[k];
 +        temp += ptv[k] * im[k];
        }
  
 -      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
 -      pthc += tx_dim;
 +      op[j] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
 +      im += tx_dim;
      }
 -
 -    im += tx_dim;  // 16
 +    im = &imbuf[0];
 +    ptv += tx_dim;
      op += shortpitch;
    }
  }
diff --cc vp9/common/vp9_invtrans.h
index 586a3dc4b,4474ba477..3cfb45fed
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@@ -11,8 -11,7 +11,8 @@@
  #ifndef VP9_COMMON_VP9_INVTRANS_H_
  #define VP9_COMMON_VP9_INVTRANS_H_
  
- #include "vpx_ports/config.h"
+ #include "./vpx_config.h"
 +#include "vpx/vpx_integer.h"
  #include "vp9/common/vp9_blockd.h"
  
  extern void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch);
diff --cc vp9/common/vp9_mbpitch.c
index 31162655d,1107402ea..e94144813
--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@@@ -38,8 -43,9 +38,8 @@@ static void setup_block(BLOCKD *b
  static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
    int block;
  
 -  unsigned char **y, **u, **v;
 -  unsigned char **y2 = NULL, **u2 = NULL, **v2 = NULL;
 +  uint8_t **y, **u, **v;
-   uint8_t **y2, **u2, **v2;
++  uint8_t **y2 = NULL, **u2 = NULL, **v2 = NULL;
    BLOCKD *blockd = xd->block;
    int stride;
  
diff --cc vp9/common/vp9_rtcd_defs.sh
index f02ee0260,6af7b3bad..95253ef67
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@@ -47,35 -45,35 +47,35 @@@ specialize vp9_dequantize_
  prototype void vp9_dequantize_b_2x2 "struct blockd *x"
  specialize vp9_dequantize_b_2x2
  
 -prototype void vp9_dequant_dc_idct_add_y_block_8x8 "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs, const short *dc, struct macroblockd *xd"
 +prototype void vp9_dequant_dc_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dc, struct macroblockd *xd"
  specialize vp9_dequant_dc_idct_add_y_block_8x8
  
 -prototype void vp9_dequant_idct_add_y_block_8x8 "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs, struct macroblockd *xd"
 +prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, struct macroblockd *xd"
  specialize vp9_dequant_idct_add_y_block_8x8
  
 -prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, const short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, unsigned short *eobs, struct macroblockd *xd"
 +prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs, struct macroblockd *xd"
  specialize vp9_dequant_idct_add_uv_block_8x8
  
 -prototype void vp9_dequant_idct_add_16x16 "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"
 +prototype void vp9_dequant_idct_add_16x16 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
  specialize vp9_dequant_idct_add_16x16
  
 -prototype void vp9_dequant_idct_add_8x8 "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int dc, unsigned short eobs"
 +prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc, int eob"
  specialize vp9_dequant_idct_add_8x8
  
 -prototype void vp9_dequant_idct_add "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
 +prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride"
  specialize vp9_dequant_idct_add
  
 -prototype void vp9_dequant_dc_idct_add "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
 +prototype void vp9_dequant_dc_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc"
  specialize vp9_dequant_dc_idct_add
  
 -prototype void vp9_dequant_dc_idct_add_y_block "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs, const short *dc"
 +prototype void vp9_dequant_dc_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dcs"
- specialize vp9_dequant_dc_idct_add_y_block mmx
+ specialize vp9_dequant_dc_idct_add_y_block
  
 -prototype void vp9_dequant_idct_add_y_block "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs"
 +prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs"
- specialize vp9_dequant_idct_add_y_block mmx
+ specialize vp9_dequant_idct_add_y_block
  
 -prototype void vp9_dequant_idct_add_uv_block "short *q, const short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, unsigned short *eobs"
 +prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
- specialize vp9_dequant_idct_add_uv_block mmx
+ specialize vp9_dequant_idct_add_uv_block
  
  #
  # RECON
@@@ -218,7 -216,8 +218,8 @@@ vp9_loop_filter_simple_bh_sse2=vp9_loop
  #
  # post proc
  #
+ if [ "$CONFIG_POSTPROC" = "yes" ]; then
 -prototype void vp9_mbpost_proc_down "unsigned char *dst, int pitch, int rows, int cols, int flimit"
 +prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit"
  specialize vp9_mbpost_proc_down mmx sse2
  vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm
  
@@@ -230,17 -229,18 +231,18 @@@ prototype void vp9_post_proc_down_and_a
  specialize vp9_post_proc_down_and_across mmx sse2
  vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
  
 -prototype void vp9_plane_add_noise "unsigned char *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"
 +prototype void vp9_plane_add_noise "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"
  specialize vp9_plane_add_noise mmx sse2
  vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt
+ fi
  
 -prototype void vp9_blend_mb_inner "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
 +prototype void vp9_blend_mb_inner "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
  specialize vp9_blend_mb_inner
  
 -prototype void vp9_blend_mb_outer "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
 +prototype void vp9_blend_mb_outer "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
  specialize vp9_blend_mb_outer
  
 -prototype void vp9_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
 +prototype void vp9_blend_b "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
  specialize vp9_blend_b
  
  #
@@@ -342,31 -342,28 +344,31 @@@ specialize vp9_bilinear_predict_avg4x
  #
  # dct
  #
 -prototype void vp9_short_idct4x4llm_1 "short *input, short *output, int pitch"
 +prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch"
- specialize vp9_short_idct4x4llm_1 mmx
+ specialize vp9_short_idct4x4llm_1
  
 -prototype void vp9_short_idct4x4llm "short *input, short *output, int pitch"
 +prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch"
- specialize vp9_short_idct4x4llm mmx
+ specialize vp9_short_idct4x4llm
  
 -prototype void vp9_short_idct8x8 "short *input, short *output, int pitch"
 +prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
  specialize vp9_short_idct8x8
  
 -prototype void vp9_short_idct10_8x8 "short *input, short *output, int pitch"
 +prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
  specialize vp9_short_idct10_8x8
  
 -prototype void vp9_short_ihaar2x2 "short *input, short *output, int pitch"
 +prototype void vp9_short_ihaar2x2 "int16_t *input, int16_t *output, int pitch"
  specialize vp9_short_ihaar2x2
  
 -prototype void vp9_short_idct16x16 "short *input, short *output, int pitch"
 +prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
  specialize vp9_short_idct16x16
  
 -prototype void vp9_short_idct10_16x16 "short *input, short *output, int pitch"
 +prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
  specialize vp9_short_idct10_16x16
  
 -prototype void vp9_ihtllm "const short *input, short *output, int pitch, int tx_type, int tx_dim, short eobs"
 +prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
 +specialize vp9_short_idct32x32
 +
- prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim"
++prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
  specialize vp9_ihtllm
  
  #
diff --cc vp9/common/vp9_systemdependent.h
index d57a42df3,6f08e6906..1b9147ef4
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@@ -11,11 -10,7 +11,11 @@@
  #ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
  #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
  
 +#ifdef _MSC_VER
 +#include <math.h>
 +#endif
 +
- #include "vpx_ports/config.h"
+ #include "./vpx_config.h"
  #if ARCH_X86 || ARCH_X86_64
  void vpx_reset_mmx_state(void);
  #define vp9_clear_system_state() vpx_reset_mmx_state()
diff --cc vp9/common/x86/vp9_asm_stubs.c
index 0d268a264,de1f0fa32..f09e2d78b
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@@ -13,8 -13,10 +13,8 @@@
  #include "vpx_ports/mem.h"
  #include "vp9/common/vp9_subpixel.h"
  
- extern const short vp9_six_tap_mmx[16][6 * 8];
+ extern const short vp9_six_tap_mmx[8][6 * 8];
  
 -extern const short vp9_bilinear_filters_8x_mmx[8][2 * 8];
 -
  extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
                                        unsigned short  *output_ptr,
                                        unsigned int     src_pixels_per_line,
diff --cc vp9/decoder/vp9_dboolhuff.h
index c8c5c3b01,635bd5b7d..5afdd67c8
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@@@ -8,12 -8,12 +8,12 @@@
   *  be found in the AUTHORS file in the root of the source tree.
   */
  
 -
  #ifndef VP9_DECODER_VP9_DBOOLHUFF_H_
  #define VP9_DECODER_VP9_DBOOLHUFF_H_
 +
  #include <stddef.h>
  #include <limits.h>
- #include "vpx_ports/config.h"
+ #include "./vpx_config.h"
  #include "vpx_ports/mem.h"
  #include "vpx/vpx_integer.h"
  
diff --cc vp9/decoder/vp9_dequantize.c
index 4376dc3d3,39a2de14b..72cd2771e
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@@ -13,8 -13,6 +13,7 @@@
  #include "vp9/decoder/vp9_dequantize.h"
  #include "vpx_mem/vpx_mem.h"
  #include "vp9/decoder/vp9_onyxd_int.h"
 +#include "vp9/common/vp9_common.h"
- 
  static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
                           uint8_t *dest, int stride, int width, int height) {
    int r, c;
diff --cc vp9/decoder/vp9_dequantize.h
index c578608ba,f348b21b0..bbbc173a2
--- a/vp9/decoder/vp9_dequantize.h
+++ b/vp9/decoder/vp9_dequantize.h
@@@ -14,105 -14,90 +14,90 @@@
  #include "vp9/common/vp9_blockd.h"
  
  #if CONFIG_LOSSLESS
- extern void vp9_dequant_idct_add_lossless_c(int16_t *input,
-                                             const int16_t *dq,
-                                             uint8_t *pred,
-                                             uint8_t *output,
 -extern void vp9_dequant_idct_add_lossless_c(short *input, const short *dq,
++extern void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
+                                             unsigned char *pred,
+                                             unsigned char *output,
                                              int pitch, int stride);
- extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input,
-                                                const int16_t *dq,
-                                                uint8_t *pred,
-                                                uint8_t *output,
 -extern void vp9_dequant_dc_idct_add_lossless_c(short *input, const short *dq,
++extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
+                                                unsigned char *pred,
+                                                unsigned char *output,
                                                 int pitch, int stride, int dc);
 -extern void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q,
 -                                                       const short *dq,
 +extern void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,
 +                                                       const int16_t *dq,
-                                                        uint8_t *pre,
-                                                        uint8_t *dst,
+                                                        unsigned char *pre,
+                                                        unsigned char *dst,
                                                         int stride,
 -                                                       unsigned short *eobs,
 -                                                       const short *dc);
 -extern void vp9_dequant_idct_add_y_block_lossless_c(short *q, const short *dq,
 +                                                       uint16_t *eobs,
 +                                                       const int16_t *dc);
- extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q,
-                                                     const int16_t *dq,
-                                                     uint8_t *pre,
-                                                     uint8_t *dst,
++extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
+                                                     unsigned char *pre,
+                                                     unsigned char *dst,
                                                      int stride,
 -                                                    unsigned short *eobs);
 -extern void vp9_dequant_idct_add_uv_block_lossless_c(short *q, const short *dq,
 +                                                    uint16_t *eobs);
- extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q,
-                                                      const int16_t *dq,
-                                                      uint8_t *pre,
-                                                      uint8_t *dst_u,
-                                                      uint8_t *dst_v,
++extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
+                                                      unsigned char *pre,
+                                                      unsigned char *dst_u,
+                                                      unsigned char *dst_v,
                                                       int stride,
 -                                                     unsigned short *eobs);
 +                                                     uint16_t *eobs);
- #endif  // CONFIG_LOSSLESS
+ #endif
  
 -typedef void (*vp9_dequant_idct_add_fn_t)(short *input, const short *dq,
 +typedef void (*vp9_dequant_idct_add_fn_t)(int16_t *input, const int16_t *dq,
-                                           uint8_t *pred, uint8_t *output,
-                                           int pitch, int stride);
+     unsigned char *pred, unsigned char *output, int pitch, int stride);
 -typedef void(*vp9_dequant_dc_idct_add_fn_t)(short *input, const short *dq,
 +typedef void(*vp9_dequant_dc_idct_add_fn_t)(int16_t *input, const int16_t *dq,
-                                             uint8_t *pred, uint8_t *output,
-                                             int pitch, int stride, int dc);
+     unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
  
- typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q,
-                                                     const int16_t *dq,
-                                                     uint8_t *pre, uint8_t *dst,
-                                                     int stride, uint16_t *eobs,
-                                                     const int16_t *dc);
 -typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(short *q, const short *dq,
 -    unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs,
 -    const short *dc);
 -typedef void(*vp9_dequant_idct_add_y_block_fn_t)(short *q, const short *dq,
 -    unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs);
 -typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(short *q, const short *dq,
++typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
++    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs,
++    const int16_t *dc);
 +typedef void(*vp9_dequant_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
-                                                  uint8_t *pre, uint8_t *dst,
-                                                  int stride, uint16_t *eobs);
++    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs);
 +typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(int16_t *q, const int16_t *dq,
-                                                   uint8_t *pre, uint8_t *dst_u,
-                                                   uint8_t *dst_v, int stride,
-                                                   uint16_t *eobs);
+     unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
 -    unsigned short *eobs);
++    uint16_t *eobs);
  
- void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
-                                const int16_t *dq,
-                                uint8_t *pred, uint8_t *dest,
-                                int pitch, int stride);
 -void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, const short *dq,
++void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
+                                     unsigned char *pred, unsigned char *dest,
+                                     int pitch, int stride, uint16_t eobs);
  
 -void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input,
 -                                   const short *dq, unsigned char *pred,
 +void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
-                                    const int16_t *dq, uint8_t *pred,
-                                    uint8_t *dest, int pitch, int stride);
++                                   const int16_t *dq, unsigned char *pred,
+                                    unsigned char *dest, int pitch, int stride,
+                                    uint16_t eobs);
  
 -void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input,
 -                                     const short *dq, unsigned char *pred,
 +void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
-                                      const int16_t *dq, uint8_t *pred,
-                                      uint8_t *dest,
-                                      int pitch, int stride);
++                                     const int16_t *dq, unsigned char *pred,
+                                      unsigned char *dest,
+                                      int pitch, int stride, uint16_t eobs);
  
  #if CONFIG_SUPERBLOCKS
- void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q,
-                                                    const int16_t *dq,
-                                                    uint8_t *dst,
 -void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, const short *dq,
++void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
+                                                    unsigned char *dst,
                                                     int stride,
 -                                                   unsigned short *eobs,
 -                                                   const short *dc,
 +                                                   uint16_t *eobs,
 +                                                   const int16_t *dc,
                                                     MACROBLOCKD *xd);
  
- void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q,
-                                                    const int16_t *dq,
-                                                    uint8_t *dst,
 -void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(short *q, const short *dq,
++void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
+                                                    unsigned char *dst,
                                                     int stride,
 -                                                   unsigned short *eobs,
 -                                                   const short *dc,
 +                                                   uint16_t *eobs,
 +                                                   const int16_t *dc,
                                                     MACROBLOCKD *xd);
  
- void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q,
-                                                  const int16_t *dq,
-                                                  uint8_t *dstu,
-                                                  uint8_t *dstv,
 -void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, const short *dq,
++void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
+                                                  unsigned char *dstu,
+                                                  unsigned char *dstv,
                                                   int stride,
 -                                                 unsigned short *eobs,
 +                                                 uint16_t *eobs,
                                                   MACROBLOCKD *xd);
  
- void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q,
-                                                  const int16_t *dq,
-                                                  uint8_t *dstu,
-                                                  uint8_t *dstv,
 -void vp9_dequant_idct_add_uv_block_4x4_inplace_c(short *q, const short *dq,
++void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
+                                                  unsigned char *dstu,
+                                                  unsigned char *dstv,
                                                   int stride,
 -                                                 unsigned short *eobs,
 +                                                 uint16_t *eobs,
                                                   MACROBLOCKD *xd);
- #endif  // CONFIG_SUPERBLOCKS
+ #endif
  
- #endif  // VP9_DECODER_VP9_DEQUANTIZE_H_
+ #endif
diff --cc vp9/decoder/vp9_onyxd_int.h
index 6b7184fbe,49e13f7f4..64975468d
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@@ -8,11 -8,11 +8,10 @@@
   *  be found in the AUTHORS file in the root of the source tree.
   */
  
 -
  #ifndef VP9_DECODER_VP9_ONYXD_INT_H_
  #define VP9_DECODER_VP9_ONYXD_INT_H_
- 
- #include "vpx_ports/config.h"
+ #include "./vpx_config.h"
 -#include "vp9/common/vp9_onyxd.h"
 +#include "vp9/decoder/vp9_onyxd.h"
  #include "vp9/decoder/vp9_treereader.h"
  #include "vp9/common/vp9_onyxc_int.h"
  #include "vp9/decoder/vp9_dequantize.h"
diff --cc vp9/encoder/vp9_encodeframe.c
index 702c35831,bd1966272..509c426d8
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@@ -2000,135 -2022,20 +2000,133 @@@ static void update_sb_skip_coeff_state(
      }
    }
  }
 +
 +#if CONFIG_SUPERBLOCKS64
 +static void update_sb64_skip_coeff_state(VP9_COMP *cpi,
 +                                         ENTROPY_CONTEXT_PLANES ta[16],
 +                                         ENTROPY_CONTEXT_PLANES tl[16],
 +                                         TOKENEXTRA *t[16],
 +                                         TOKENEXTRA **tp,
 +                                         int skip[16], int output_enabled) {
 +  MACROBLOCK *const x = &cpi->mb;
 +
 +  if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_32X32) {
 +    TOKENEXTRA tokens[4][1024+512];
 +    int n_tokens[4], n;
 +
 +    // if there were no skips, we don't need to do anything
 +    if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
 +      return;
 +
 +    // if we don't do coeff skipping for this frame, we don't
 +    // need to do anything here
 +    if (!cpi->common.mb_no_coeff_skip)
 +      return;
 +
 +    // if all 4 MBs skipped coeff coding, nothing to be done
 +    if (skip[0] && skip[1] && skip[2] && skip[3])
 +      return;
 +
 +    // so the situation now is that we want to skip coeffs
 +    // for some MBs, but not all, and we didn't code EOB
 +    // coefficients for them. However, the skip flag for this
 +    // SB will be 0 overall, so we need to insert EOBs in the
 +    // middle of the token tree. Do so here.
 +    for (n = 0; n < 4; n++) {
 +      if (n < 3) {
 +        n_tokens[n] = t[n + 1] - t[n];
 +      } else {
 +        n_tokens[n] = *tp - t[3];
 +      }
 +      if (n_tokens[n]) {
 +        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
 +      }
 +    }
 +
 +    // reset pointer, stuff EOBs where necessary
 +    *tp = t[0];
 +    for (n = 0; n < 4; n++) {
 +      if (skip[n]) {
 +        x->e_mbd.above_context = &ta[n * 2];
 +        x->e_mbd.left_context  = &tl[n * 2];
 +        vp9_stuff_sb(cpi, &x->e_mbd, tp, !output_enabled);
 +      } else {
 +        if (n_tokens[n]) {
 +          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
 +        }
 +        (*tp) += n_tokens[n];
 +      }
 +    }
 +  } else {
 +    TOKENEXTRA tokens[16][16 * 25];
 +    int n_tokens[16], n;
 +
 +    // if there were no skips, we don't need to do anything
 +    if (!skip[ 0] && !skip[ 1] && !skip[ 2] && !skip[ 3] &&
 +        !skip[ 4] && !skip[ 5] && !skip[ 6] && !skip[ 7] &&
 +        !skip[ 8] && !skip[ 9] && !skip[10] && !skip[11] &&
 +        !skip[12] && !skip[13] && !skip[14] && !skip[15])
 +      return;
 +
 +    // if we don't do coeff skipping for this frame, we don't
 +    // need to do anything here
 +    if (!cpi->common.mb_no_coeff_skip)
 +      return;
 +
 +    // if all 16 MBs skipped coeff coding, nothing to be done
 +    if (skip[ 0] && skip[ 1] && skip[ 2] && skip[ 3] &&
 +        skip[ 4] && skip[ 5] && skip[ 6] && skip[ 7] &&
 +        skip[ 8] && skip[ 9] && skip[10] && skip[11] &&
 +        skip[12] && skip[13] && skip[14] && skip[15])
 +      return;
 +
 +    // so the situation now is that we want to skip coeffs
 +    // for some MBs, but not all, and we didn't code EOB
 +    // coefficients for them. However, the skip flag for this
 +    // SB will be 0 overall, so we need to insert EOBs in the
 +    // middle of the token tree. Do so here.
 +    for (n = 0; n < 16; n++) {
 +      if (n < 15) {
 +        n_tokens[n] = t[n + 1] - t[n];
 +      } else {
 +        n_tokens[n] = *tp - t[15];
 +      }
 +      if (n_tokens[n]) {
 +        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
 +      }
 +    }
 +
 +    // reset pointer, stuff EOBs where necessary
 +    *tp = t[0];
 +    for (n = 0; n < 16; n++) {
 +      if (skip[n]) {
 +        x->e_mbd.above_context = &ta[n];
 +        x->e_mbd.left_context  = &tl[n];
 +        vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled);
 +      } else {
 +        if (n_tokens[n]) {
 +          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
 +        }
 +        (*tp) += n_tokens[n];
 +      }
 +    }
 +  }
 +}
 +#endif  // CONFIG_SUPERBLOCKS64
  #endif /* CONFIG_SUPERBLOCKS */
  
 -static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
 -                              TOKENEXTRA **t, int recon_yoffset,
 -                              int recon_uvoffset, int output_enabled,
 -                              int mb_col, int mb_row) {
 -  VP9_COMMON *cm = &cpi->common;
 +static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
 +                              int recon_yoffset, int recon_uvoffset,
 +                              int output_enabled,
 +                              int mb_row, int mb_col) {
 +  VP9_COMMON *const cm = &cpi->common;
 +  MACROBLOCK *const x = &cpi->mb;
    MACROBLOCKD *const xd = &x->e_mbd;
 -  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
 +  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-   unsigned char *segment_id = &mbmi->segment_id;
-   int seg_ref_active;
    unsigned char ref_pred_flag;
  
 -  x->skip = 0;
  #if CONFIG_SUPERBLOCKS
 -  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
 +  assert(!xd->mode_info_context->mbmi.sb_type);
  #endif
  
  #ifdef ENC_DEBUG
diff --cc vp9/encoder/vp9_picklpf.c
index 7091c4932,4eb51df41..b443ede6f
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@@ -21,10 -21,9 +21,9 @@@
  
  void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
                                     YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
 -  unsigned char *src_y, *dst_y;
 +  uint8_t *src_y, *dst_y;
    int yheight;
    int ystride;
-   int border;
    int yoffset;
    int linestocopy;
  
diff --cc vp9/encoder/vp9_rdopt.c
index 267dd0aa5,27decb91e..8e91d828f
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@@ -4127,8 -3625,9 +4131,9 @@@ static void rd_pick_inter_mode(VP9_COM
  #if CONFIG_COMP_INTRA_PRED
                                               0,
  #endif
 -                                             0);
 +                                             cpi->update_context);
            rate2 += rate;
+           rate2 += intra_cost_penalty;
            distortion2 += distortion;
  
            if (tmp_rd < best_yrd) {
diff --cc vp9/encoder/vp9_sad_c.c
index 9ce27fbed,465044278..84121f79c
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@@ -11,20 -11,12 +11,20 @@@
  
  #include <stdlib.h>
  #include "vp9/common/vp9_sadmxn.h"
- #include "vpx_ports/config.h"
+ #include "./vpx_config.h"
  #include "vpx/vpx_integer.h"
  
 -unsigned int vp9_sad32x32_c(const unsigned char *src_ptr,
 +unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
                              int  src_stride,
 -                            const unsigned char *ref_ptr,
 +                            const uint8_t *ref_ptr,
 +                            int  ref_stride,
 +                            int max_sad) {
 +  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
 +}
 +
 +unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
 +                            int  src_stride,
 +                            const uint8_t *ref_ptr,
                              int  ref_stride,
                              int max_sad) {
    return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
diff --cc vp9/encoder/vp9_temporal_filter.c
index 159d6faa5,57253bd50..8bbe53486
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@@ -123,14 -129,16 +123,13 @@@ void vp9_temporal_filter_apply_c(uint8_
  
  #if ALT_REF_MC_ENABLED
  
 -static int temporal_filter_find_matching_mb_c
 -(
 -  VP9_COMP *cpi,
 -  YV12_BUFFER_CONFIG *arf_frame,
 -  YV12_BUFFER_CONFIG *frame_ptr,
 -  int mb_offset,
 -  int error_thresh
 -) {
 +static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
 +                                              YV12_BUFFER_CONFIG *arf_frame,
 +                                              YV12_BUFFER_CONFIG *frame_ptr,
 +                                              int mb_offset,
 +                                              int error_thresh) {
    MACROBLOCK *x = &cpi->mb;
    int step_param;
-   int further_steps;
    int sadpb = x->sadperbit16;
    int bestsme = INT_MAX;