From 5b0cb86f27ba0c5433c404bed51c06a5124dfb49 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 26 Aug 2011 15:57:04 +0200 Subject: [PATCH] 4:2:2 encoding support --- AUTHORS | 5 + common/bitstream.h | 7 +- common/common.c | 68 +++++--- common/common.h | 19 +- common/dct.c | 91 +++++++++- common/dct.h | 4 + common/deblock.c | 142 +++++++++------ common/frame.c | 88 +++++----- common/frame.h | 8 + common/macroblock.c | 169 ++++++++++-------- common/macroblock.h | 9 - common/mc.c | 23 ++- common/mc.h | 23 ++- common/pixel.c | 88 ++++++---- common/pixel.h | 61 ++++--- common/predict.c | 175 ++++++++++++++++++- common/predict.h | 7 +- common/quant.c | 198 ++++++++++++++------- common/quant.h | 10 +- common/set.h | 9 +- common/vlc.c | 115 +++++++++++- common/x86/mc-a2.asm | 21 +-- common/x86/mc-c.c | 50 +++--- common/x86/quant-a.asm | 14 +- common/x86/quant.h | 8 +- encoder/analyse.c | 227 +++++++++++++----------- encoder/cabac.c | 200 +++++++++++++-------- encoder/cavlc.c | 67 ++++--- encoder/encoder.c | 103 ++++++++--- encoder/macroblock.c | 386 ++++++++++++++++++++++++++++------------- encoder/macroblock.h | 9 +- encoder/me.c | 52 +++--- encoder/ratecontrol.c | 26 +-- encoder/rdo.c | 103 ++++++----- encoder/set.c | 27 +-- encoder/slicetype.c | 35 ++-- filters/video/depth.c | 8 +- filters/video/resize.c | 7 +- input/avs.c | 23 ++- input/input.c | 4 +- input/input.h | 3 +- tools/checkasm.c | 290 +++++++++++++++++++++---------- x264.c | 7 +- x264.h | 21 ++- 44 files changed, 2044 insertions(+), 966 deletions(-) diff --git a/AUTHORS b/AUTHORS index 8acaba47..60ffb621 100644 --- a/AUTHORS +++ b/AUTHORS @@ -42,6 +42,11 @@ E: gpoirier CHEZ mplayerhq POINT hu D: Altivec optimizations S: Brittany, France +N: Henrik Gramner +E: hengar-6 AT student DOT ltu DOT se +D: 4:2:2 chroma subsampling, x86 asm +S: Sweden + N: Fiona Glaser E: fiona AT x264 DOT com D: x86 asm, 1pass VBV, adaptive quantization, inline asm diff --git a/common/bitstream.h b/common/bitstream.h index 6300e52a..058db8b4 100644 --- a/common/bitstream.h +++ b/common/bitstream.h @@ -60,10 +60,11 @@ typedef struct uint8_t run[16]; } x264_run_level_t; -extern const vlc_t x264_coeff0_token[5]; -extern const vlc_t x264_coeff_token[5][16][4]; +extern const vlc_t x264_coeff0_token[6]; +extern const vlc_t x264_coeff_token[6][16][4]; extern const vlc_t x264_total_zeros[15][16]; -extern const vlc_t x264_total_zeros_dc[3][4]; +extern const vlc_t x264_total_zeros_2x2_dc[3][4]; +extern const vlc_t x264_total_zeros_2x4_dc[7][8]; extern const vlc_t x264_run_before[7][16]; typedef struct diff --git a/common/common.c b/common/common.c index ce076e59..4c978d3c 100644 --- a/common/common.c +++ b/common/common.c @@ -426,21 +426,57 @@ void x264_param_apply_fastfirstpass( x264_param_t *param ) } } +static int profile_string_to_int( const char *str ) +{ + if( !strcasecmp( str, "baseline" ) ) + return PROFILE_BASELINE; + if( !strcasecmp( str, "main" ) ) + return PROFILE_MAIN; + if( !strcasecmp( str, "high" ) ) + return PROFILE_HIGH; + if( !strcasecmp( str, "high10" ) ) + return PROFILE_HIGH10; + if( !strcasecmp( str, "high422" ) ) + return PROFILE_HIGH422; + if( !strcasecmp( str, "high444" ) ) + return PROFILE_HIGH444_PREDICTIVE; + return -1; +} + int x264_param_apply_profile( x264_param_t *param, const char *profile ) { if( !profile ) return 0; -#if BIT_DEPTH > 8 - if( !strcasecmp( profile, "baseline" ) || !strcasecmp( profile, "main" ) || - !strcasecmp( profile, "high" ) ) + int p = profile_string_to_int( profile ); + if( p < 0 ) { - x264_log( NULL, 
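x264_param_apply_profile() now maps the profile string to an ordinal (baseline < main < high < high10 < high422 < high444), so every capability check becomes a plain comparison against the weakest profile that supports the feature. A usage sketch, not part of the patch:

    #include "x264.h"
    /* Hypothetical caller: request 4:2:2 input and a profile able to carry it. */
    static int setup_422( x264_param_t *param )
    {
        x264_param_default_preset( param, "medium", NULL );
        param->i_csp = X264_CSP_I422;   /* colorspace added by this patch */
        /* "high" would now fail with "doesn't support 4:2:2": */
        return x264_param_apply_profile( param, "high422" );
    }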
X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d.\n", profile, BIT_DEPTH ); + x264_log( NULL, X264_LOG_ERROR, "invalid profile: %s\n", profile ); + return -1; + } + if( p < PROFILE_HIGH444_PREDICTIVE && ((param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) || + (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + QP_BD_OFFSET) <= 0)) ) + { + x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile ); + return -1; + } + if( p < PROFILE_HIGH444_PREDICTIVE && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I444 ) + { + x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:4:4\n", profile ); + return -1; + } + if( p < PROFILE_HIGH422 && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I422 ) + { + x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:2:2\n", profile ); + return -1; + } + if( p < PROFILE_HIGH10 && BIT_DEPTH > 8 ) + { + x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d\n", profile, BIT_DEPTH ); return -1; } -#endif - if( !strcasecmp( profile, "baseline" ) ) + if( p == PROFILE_BASELINE ) { param->analyse.b_transform_8x8 = 0; param->b_cabac = 0; @@ -459,27 +495,12 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile ) return -1; } } - else if( !strcasecmp( profile, "main" ) ) + else if( p == PROFILE_MAIN ) { param->analyse.b_transform_8x8 = 0; param->i_cqm_preset = X264_CQM_FLAT; param->psz_cqm_file = NULL; } - else if( !strcasecmp( profile, "high" ) || !strcasecmp( profile, "high10" ) ) - { - /* Default */ - } - else - { - x264_log( NULL, X264_LOG_ERROR, "invalid profile: %s\n", profile ); - return -1; - } - if( (param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) || - (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + QP_BD_OFFSET) <= 0) ) - { - x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile ); - return -1; - } return 0; } @@ -1075,6 +1096,9 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh [X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_NV12] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, + [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, + [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, + [X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, }, [X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_BGR] = { 1, { 256*3 }, { 256*1 }, }, diff --git a/common/common.h b/common/common.h index a4e1cf96..d1f830f6 100644 --- a/common/common.h +++ b/common/common.h @@ -40,6 +40,9 @@ #define IS_DISPOSABLE(type) ( type == X264_TYPE_B ) #define FIX8(f) ((int)(f*(1<<8)+.5)) #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1)) +#define CHROMA_FORMAT h->sps->i_chroma_format_idc +#define CHROMA_SIZE(s) ((s)>>(h->mb.chroma_h_shift+h->mb.chroma_v_shift)) +#define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s)) #define CHECKED_MALLOC( var, size )\ do {\ @@ -56,7 +59,7 @@ do {\ #define X264_BFRAME_MAX 16 #define X264_REF_MAX 16 #define X264_THREAD_MAX 128 -#define X264_PCM_COST ((384<sps->i_chroma_format_idc == 3) +#define CHROMA444 (CHROMA_FORMAT == CHROMA_444) /* Unions for type-punning. 
* Mn: load or store n bits, aligned, native-endian @@ -565,7 +568,7 @@ struct x264_t struct { ALIGNED_16( dctcoef luma16x16_dc[3][16] ); - ALIGNED_16( dctcoef chroma_dc[2][4] ); + ALIGNED_16( dctcoef chroma_dc[2][8] ); // FIXME share memory? ALIGNED_16( dctcoef luma8x8[12][64] ); ALIGNED_16( dctcoef luma4x4[16*3][16] ); @@ -578,6 +581,10 @@ struct x264_t int i_mb_height; int i_mb_count; /* number of mbs in a frame */ + /* Chroma subsampling */ + int chroma_h_shift; + int chroma_v_shift; + /* Strides */ int i_mb_stride; int i_b8_stride; @@ -882,6 +889,8 @@ struct x264_t ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] ); uint32_t nr_count_buf[2][4]; + uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */ + /* Buffers that are allocated per-thread even in sliced threads. */ void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */ pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */ @@ -891,9 +900,11 @@ struct x264_t /* CPU functions dependents */ x264_predict_t predict_16x16[4+3]; - x264_predict_t predict_8x8c[4+3]; x264_predict8x8_t predict_8x8[9+3]; x264_predict_t predict_4x4[9+3]; + x264_predict_t predict_chroma[4+3]; + x264_predict_t predict_8x8c[4+3]; + x264_predict_t predict_8x16c[4+3]; x264_predict_8x8_filter_t predict_8x8_filter; x264_pixel_function_t pixf; diff --git a/common/dct.c b/common/dct.c index 9653ee47..cf8a2351 100644 --- a/common/dct.c +++ b/common/dct.c @@ -5,6 +5,7 @@ * * Authors: Loren Merritt * Laurent Aimar + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -100,6 +101,42 @@ static void idct4x4dc( dctcoef d[16] ) } } +static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] ) +{ + int a0 = dct4x4[0][0] + dct4x4[1][0]; + int a1 = dct4x4[2][0] + dct4x4[3][0]; + int a2 = dct4x4[4][0] + dct4x4[5][0]; + int a3 = dct4x4[6][0] + dct4x4[7][0]; + int a4 = dct4x4[0][0] - dct4x4[1][0]; + int a5 = dct4x4[2][0] - dct4x4[3][0]; + int a6 = dct4x4[4][0] - dct4x4[5][0]; + int a7 = dct4x4[6][0] - dct4x4[7][0]; + int b0 = a0 + a1; + int b1 = a2 + a3; + int b2 = a4 + a5; + int b3 = a6 + a7; + int b4 = a0 - a1; + int b5 = a2 - a3; + int b6 = a4 - a5; + int b7 = a6 - a7; + dct[0] = b0 + b1; + dct[1] = b2 + b3; + dct[2] = b0 - b1; + dct[3] = b2 - b3; + dct[4] = b4 - b5; + dct[5] = b6 - b7; + dct[6] = b4 + b5; + dct[7] = b6 + b7; + dct4x4[0][0] = 0; + dct4x4[1][0] = 0; + dct4x4[2][0] = 0; + dct4x4[3][0] = 0; + dct4x4[4][0] = 0; + dct4x4[5][0] = 0; + dct4x4[6][0] = 0; + dct4x4[7][0] = 0; +} + static inline void pixel_sub_wxh( dctcoef *diff, int i_size, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) { @@ -164,14 +201,10 @@ static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 ) static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 ) { - dctcoef d[16]; int sum = 0; - - pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE ); - - sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7]; - sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15]; - + for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE ) + sum += pix1[0] + pix1[1] + pix1[2] + pix1[3] + - pix2[0] - pix2[1] - pix2[2] - pix2[3]; return sum; } @@ -188,11 +221,49 @@ static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 ) int d2 = dct[0] - dct[1]; int d3 = dct[2] - dct[3]; dct[0] = d0 + d1; - dct[2] = d2 + d3; dct[1] = d0 - d1; + 
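dct2x4dc() above is the 4:2:2 counterpart of the 2x2 chroma DC Hadamard: it gathers the DC term of each of the eight 4x4 chroma blocks, transforms them, and zeroes the per-block DCs so they are not coded twice. Functionally it is a separable 2-point transform across the two columns followed by a 4-point Hadamard down the four rows, with the outputs stored in a custom order; a reference model written for illustration, not code from the patch:

    static void dct2x4dc_ref( int out[8], const int in[8] ) /* in: row-major 2x4 */
    {
        static const int sign[4][4] = { {1,1,1,1}, {1,1,-1,-1}, {1,-1,-1,1}, {1,-1,1,-1} };
        static const int pos[2][4]  = { {0,2,4,6}, {1,3,5,7} }; /* column sums land in even slots */
        for( int c = 0; c < 2; c++ )      /* 0: column sum, 1: column difference */
            for( int r = 0; r < 4; r++ )  /* 4-point Hadamard down the rows */
            {
                int t = 0;
                for( int i = 0; i < 4; i++ )
                    t += sign[r][i] * (in[2*i] + (c ? -in[2*i+1] : in[2*i+1]));
                out[pos[c][r]] = t;
            }
    }

sub8x16_dct_dc() in the next hunk applies the same butterfly network directly to pixel-domain DC sums.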
dct[2] = d2 + d3; dct[3] = d2 - d3; } +static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 ) +{ + int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] ); + int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] ); + int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] ); + int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] ); + int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] ); + int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] ); + int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] ); + int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] ); + + /* 2x4 DC transform */ + int b0 = a0 + a1; + int b1 = a2 + a3; + int b2 = a4 + a5; + int b3 = a6 + a7; + int b4 = a0 - a1; + int b5 = a2 - a3; + int b6 = a4 - a5; + int b7 = a6 - a7; + a0 = b0 + b1; + a1 = b2 + b3; + a2 = b4 + b5; + a3 = b6 + b7; + a4 = b0 - b1; + a5 = b2 - b3; + a6 = b4 - b5; + a7 = b6 - b7; + dct[0] = a0 + a1; + dct[1] = a2 + a3; + dct[2] = a0 - a1; + dct[3] = a2 - a3; + dct[4] = a4 - a5; + dct[5] = a6 - a7; + dct[6] = a4 + a5; + dct[7] = a6 + a7; +} + static void add4x4_idct( pixel *p_dst, dctcoef dct[16] ) { dctcoef d[16]; @@ -408,6 +479,8 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->add8x8_idct = add8x8_idct; dctf->add8x8_idct_dc = add8x8_idct_dc; + dctf->sub8x16_dct_dc = sub8x16_dct_dc; + dctf->sub16x16_dct = sub16x16_dct; dctf->add16x16_idct = add16x16_idct; dctf->add16x16_idct_dc = add16x16_idct_dc; @@ -421,6 +494,8 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->dct4x4dc = dct4x4dc; dctf->idct4x4dc = idct4x4dc; + dctf->dct2x4dc = dct2x4dc; + #if HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) diff --git a/common/dct.h b/common/dct.h index a764e491..044ad1e1 100644 --- a/common/dct.h +++ b/common/dct.h @@ -104,6 +104,8 @@ typedef struct void (*add8x8_idct) ( pixel *p_dst, dctcoef dct[4][16] ); void (*add8x8_idct_dc) ( pixel *p_dst, dctcoef dct[4] ); + void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 ); + void (*sub16x16_dct) ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 ); void (*add16x16_idct)( pixel *p_dst, dctcoef dct[16][16] ); void (*add16x16_idct_dc) ( pixel *p_dst, dctcoef dct[16] ); @@ -117,6 +119,8 @@ typedef struct void (*dct4x4dc) ( dctcoef d[16] ); void (*idct4x4dc)( dctcoef d[16] ); + void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] ); + } x264_dct_function_t; typedef struct diff --git a/common/deblock.c b/common/deblock.c index 22d37635..a1108b20 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -6,6 +6,7 @@ * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -119,7 +120,7 @@ static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alp deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] ); } } -static void deblock_v_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { for( int d = 0; d < 8; d++, pix += stride ) deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] ); @@ -147,33 +148,42 @@ static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int al pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } } -static inline void 
deblock_chroma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) +static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++ ) { int tc = tc0[i]; if( tc <= 0 ) { - pix += 2*ystride; + pix += height*ystride; continue; } - for( int d = 0; d < 2; d++, pix += ystride-2 ) - for( int e = 0; e < 2; e++, pix++ ) - deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] ); + for( int d = 0; d < height; d++, pix += ystride-2 ) + for( int e = 0; e < 2; e++, pix++ ) + deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] ); } } -static void deblock_v_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++, pix += stride ) deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i] ); } +static void deblock_h_chroma_422_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +{ + for( int i = 0; i < 8; i++, pix += stride ) + deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i>>1] ); +} static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 ); + deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 ); } static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 ); + deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 ); +} +static void deblock_h_chroma_422_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +{ + deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 ); } static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta ) @@ -220,7 +230,7 @@ static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, i for( int d = 0; d < 16; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, xstride, alpha, beta ); } -static void deblock_v_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta ) +static void deblock_h_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta ) { for( int d = 0; d < 8; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, 1, alpha, beta ); @@ -247,24 +257,33 @@ static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } } -static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int dir ) +static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, int xstride, int ystride, int alpha, int beta ) { - for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 ) - for( int e = 0; e < (dir?1:2); e++, pix++ ) - deblock_edge_chroma_intra_c( pix, xstride, alpha, beta ); + for( int d = 0; d < height; d++, pix += ystride-2 ) + for( int e = 0; e < width; e++, pix++ ) + deblock_edge_chroma_intra_c( pix, xstride, alpha, beta ); } -static void deblock_v_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) { for( int i = 0; i < 4; i++, pix += stride ) deblock_edge_chroma_intra_c( pix, 2, alpha, beta ); } +static void deblock_h_chroma_422_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) +{ + for( int i = 0; i < 8; i++, pix += stride ) + deblock_edge_chroma_intra_c( pix, 2, alpha, beta ); +} static void 
deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
 {
-    deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 );
+    deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta );
 }
 static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
 {
-    deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 );
+    deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta );
+}
+static void deblock_h_chroma_422_intra_c( pixel *pix, int stride, int alpha, int beta )
+{
+    deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta );
 }
 
 static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
@@ -375,6 +394,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
     int stridey = h->fdec->i_stride[0];
     int strideuv = h->fdec->i_stride[1];
     int chroma444 = CHROMA444;
+    int chroma_height = 16 >> h->mb.chroma_v_shift;
     intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;
 
     for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
@@ -388,12 +408,12 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x];
 
         pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
-        pixel *pixuv = h->fdec->plane[1] + (8<<CHROMA444)*mb_y*strideuv + 16*mb_x;
+        pixel *pixuv = h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x;
 
         if( mb_y & MB_INTERLACED )
         {
             pixy -= 15*stridey;
-            pixuv -= ((8<<CHROMA444)-1)*strideuv;
+            pixuv -= (chroma_height-1)*strideuv;
         }
@@ -413,21 +433,34 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         #define FILTER( intra, dir, edge, qp, chroma_qp )\
         do\
         {\
-            deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
-                                 stride2y, bs[dir][edge], qp, a, b, 0,\
-                                 h->loopf.deblock_luma##intra[dir] );\
-            if( chroma444 )\
+            if( !(edge & 1) || !transform_8x8 )\
             {\
-                deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\
-                                     stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
-                                     h->loopf.deblock_luma##intra[dir] );\
-                deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
-                                     stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+                deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
+                                     stride2y, bs[dir][edge], qp, a, b, 0,\
                                      h->loopf.deblock_luma##intra[dir] );\
+                if( CHROMA_FORMAT == CHROMA_444 )\
+                {\
+                    deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\
+                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+                                         h->loopf.deblock_luma##intra[dir] );\
+                    deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
+                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+                                         h->loopf.deblock_luma##intra[dir] );\
+                }\
+                else if( CHROMA_FORMAT == CHROMA_420 && !(edge & 1) )\
+                {\
+                    deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\
+                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
+                                         h->loopf.deblock_chroma##intra[dir] );\
+                }\
             }\
-            else if( !(edge & 1) )\
-                deblock_edge##intra( h, pixuv + 2*edge*(dir?stride2uv:2),\
+            if( CHROMA_FORMAT == CHROMA_422 && (dir || !(edge & 1)) )\
+            {\
+                deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\
                                      stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
                                      h->loopf.deblock_chroma##intra[dir] );\
+            }\
         } while(0)
 
         if( h->mb.i_neighbour & MB_LEFT )
@@ -431,9 +462,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                 int chroma_qp[2];
                 int left_qp[2];
                 x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff;
-                x264_deblock_inter_t chroma_deblock = chroma444 ? h->loopf.deblock_luma_mbaff : h->loopf.deblock_chroma_mbaff;
+                x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff;
                 x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff;
-                x264_deblock_intra_t chroma_intra_deblock = chroma444 ? h->loopf.deblock_luma_intra_mbaff : h->loopf.deblock_chroma_intra_mbaff;
+                x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff;
                 int c = chroma444 ?
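The reworked FILTER macro folds the per-format chroma rules into one place: 4:4:4 filters chroma exactly like luma, 4:2:0 filters chroma only on even edges in both directions, and 4:2:2 — whose chroma blocks are 8x16 — filters every horizontal edge but only the even vertical ones. Restated as a predicate for illustration (ignoring the luma 8x8-transform skip), not code from the patch:

    /* dir: 0 = vertical edges, 1 = horizontal edges; edge = 0..3 */
    static int chroma_edge_filtered( int chroma_format_idc, int dir, int edge )
    {
        if( chroma_format_idc == 3 ) return 1;                  /* 4:4:4: same grid as luma */
        if( chroma_format_idc == 2 ) return dir || !(edge & 1); /* 4:2:2: all rows, even columns */
        return !(edge & 1);                                     /* 4:2:0: even edges only */
    }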
0 : 1; left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]]; @@ -453,8 +484,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) } int offy = MB_INTERLACED ? 4 : 0; - int offuv = MB_INTERLACED ? 3 : 0; - if( chroma444 ) offuv = offy; + int offuv = MB_INTERLACED ? 4-h->mb.chroma_v_shift : 0; left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]]; luma_qp[1] = (qp + left_qp[1] + 1) >> 1; chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1; @@ -486,9 +516,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) } if( !first_edge_only ) { - if( !transform_8x8 ) FILTER( , 0, 1, qp, qpc ); - FILTER( , 0, 2, qp, qpc ); - if( !transform_8x8 ) FILTER( , 0, 3, qp, qpc ); + FILTER( , 0, 1, qp, qpc ); + FILTER( , 0, 2, qp, qpc ); + FILTER( , 0, 3, qp, qpc ); } if( h->mb.i_neighbour & MB_TOP ) @@ -540,9 +570,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) if( !first_edge_only ) { - if( !transform_8x8 ) FILTER( , 1, 1, qp, qpc ); - FILTER( , 1, 2, qp, qpc ); - if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc ); + FILTER( , 1, 1, qp, qpc ); + FILTER( , 1, 2, qp, qpc ); + FILTER( , 1, 3, qp, qpc ); } #undef FILTER @@ -553,7 +583,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) * TODO: * deblock macroblock edges * support analysis partitions smaller than 16x16 - * deblock chroma for 4:2:0 + * deblock chroma for 4:2:0/4:2:2 * handle duplicate refs correctly * handle cavlc+8x8dct correctly */ @@ -683,15 +713,19 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = deblock_v_luma_c; pf->deblock_luma[0] = deblock_h_luma_c; pf->deblock_chroma[1] = deblock_v_chroma_c; - pf->deblock_chroma[0] = deblock_h_chroma_c; + pf->deblock_h_chroma_420 = deblock_h_chroma_c; + pf->deblock_h_chroma_422 = deblock_h_chroma_422_c; pf->deblock_luma_intra[1] = deblock_v_luma_intra_c; pf->deblock_luma_intra[0] = deblock_h_luma_intra_c; pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c; - pf->deblock_chroma_intra[0] = deblock_h_chroma_intra_c; - pf->deblock_luma_mbaff = deblock_v_luma_mbaff_c; - pf->deblock_chroma_mbaff = deblock_v_chroma_mbaff_c; - pf->deblock_luma_intra_mbaff = deblock_v_luma_intra_mbaff_c; - pf->deblock_chroma_intra_mbaff = deblock_v_chroma_intra_mbaff_c; + pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c; + pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c; + pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c; + pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c; + pf->deblock_chroma_422_mbaff = deblock_h_chroma_422_mbaff_c; + pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c; + pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c; + pf->deblock_chroma_422_intra_mbaff = deblock_h_chroma_422_intra_mbaff_c; pf->deblock_strength = deblock_strength_c; #if HAVE_MMX @@ -701,11 +735,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = x264_deblock_v_luma_mmx2; pf->deblock_luma[0] = x264_deblock_h_luma_mmx2; pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2; - pf->deblock_chroma[0] = x264_deblock_h_chroma_mmx2; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2; - pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmx2; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2; #endif pf->deblock_strength = x264_deblock_strength_mmx2; if( cpu&X264_CPU_SSE2 ) @@ 
-716,11 +750,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = x264_deblock_v_luma_sse2; pf->deblock_luma[0] = x264_deblock_h_luma_sse2; pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2; - pf->deblock_chroma[0] = x264_deblock_h_chroma_sse2; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2; - pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_sse2; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2; } } if( cpu&X264_CPU_SSSE3 ) @@ -733,11 +767,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = x264_deblock_v_luma_avx; pf->deblock_luma[0] = x264_deblock_h_luma_avx; pf->deblock_chroma[1] = x264_deblock_v_chroma_avx; - pf->deblock_chroma[0] = x264_deblock_h_chroma_avx; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx; - pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_avx; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx; } } } @@ -758,7 +792,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = x264_deblock_v_luma_neon; pf->deblock_luma[0] = x264_deblock_h_luma_neon; // pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; -// pf->deblock_chroma[0] = x264_deblock_h_chroma_neon; +// pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon; } #endif #endif // !HIGH_BIT_DEPTH diff --git a/common/frame.c b/common/frame.c index b95c2a86..594aeccc 100644 --- a/common/frame.c +++ b/common/frame.c @@ -50,6 +50,10 @@ static int x264_frame_internal_csp( int external_csp ) case X264_CSP_I420: case X264_CSP_YV12: return X264_CSP_NV12; + case X264_CSP_NV16: + case X264_CSP_I422: + case X264_CSP_YV16: + return X264_CSP_NV16; case X264_CSP_I444: case X264_CSP_YV24: case X264_CSP_BGR: @@ -66,11 +70,10 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) x264_frame_t *frame; int i_csp = x264_frame_internal_csp( h->param.i_csp ); int i_mb_count = h->mb.i_mb_count; - int i_stride, i_width, i_lines; + int i_stride, i_width, i_lines, luma_plane_count; int i_padv = PADV << PARAM_INTERLACED; int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16; int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10; - int luma_plane_count = i_csp == X264_CSP_NV12 ? 
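Frame allocation treats NV16 like NV12 except vertically: the interleaved chroma plane keeps full height and full vertical padding. The rule used in the allocation below, restated for illustration only:

    int chroma_padv       = i_padv  >> (i_csp == X264_CSP_NV12); /* halved only for 4:2:0 */
    int chroma_lines      = i_lines >> (i_csp == X264_CSP_NV12);
    int chroma_plane_size = i_stride * (chroma_lines + 2*chroma_padv);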
1 : 3; CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) ); @@ -79,18 +82,20 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) i_lines = h->mb.i_mb_height*16; i_stride = align_stride( i_width + 2*PADH, align, disalign ); - if( i_csp == X264_CSP_NV12 ) + if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { + luma_plane_count = 1; frame->i_plane = 2; for( int i = 0; i < 2; i++ ) { frame->i_width[i] = i_width >> i; - frame->i_lines[i] = i_lines >> i; + frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12); frame->i_stride[i] = i_stride; } } else if( i_csp == X264_CSP_I444 ) { + luma_plane_count = 3; frame->i_plane = 3; for( int i = 0; i < 3; i++ ) { @@ -130,15 +135,16 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) frame->orig = frame; - if( i_csp == X264_CSP_NV12 ) + if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { - int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv)); + int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12); + int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv)); CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) ); - frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH; + frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH; if( PARAM_INTERLACED ) { CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) ); - frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * i_padv/2 + PADH; + frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH; } } @@ -367,23 +373,25 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) } else { + int v_shift = h->mb.chroma_v_shift; get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 ); h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0], stride[0]/sizeof(pixel), h->param.i_width, h->param.i_height ); - if( i_csp == X264_CSP_NV12 ) + if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { - get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, 1 ); + get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift ); h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], - stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>1 ); + stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift ); } - else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_YV12 ) + else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 ) { - get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 ); - get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 2 : 1, 1, 1 ); + int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16; + get_plane_ptr( h, src, &pix[1], &stride[1], uv_swap ? 2 : 1, 1, v_shift ); + get_plane_ptr( h, src, &pix[2], &stride[2], uv_swap ? 
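YV12 and YV16 store V before U, so the copy below swaps the source plane indices before interleaving into the internal NV12/NV16 layout; a restatement for illustration:

    int u_plane = uv_swap ? 2 : 1; /* I420/I422: Y,U,V  --  YV12/YV16: Y,V,U */
    int v_plane = uv_swap ? 1 : 2;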
1 : 2, 1, v_shift ); h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], stride[1]/sizeof(pixel), (pixel*)pix[2], stride[2]/sizeof(pixel), - h->param.i_width>>1, h->param.i_height>>1 ); + h->param.i_width>>1, h->param.i_height>>v_shift ); } else //if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 ) { @@ -478,33 +486,34 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e return; for( int i = 0; i < frame->i_plane; i++ ) { - int shift = i && !CHROMA444; + int h_shift = i && h->mb.chroma_h_shift; + int v_shift = i && h->mb.chroma_v_shift; int stride = frame->i_stride[i]; int width = 16*h->mb.i_mb_width; - int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> shift; + int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift; int padh = PADH; - int padv = PADV >> shift; + int padv = PADV >> v_shift; // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb if( b_end && !b_start ) - height += 4 >> (shift + SLICE_MBAFF); + height += 4 >> (v_shift + SLICE_MBAFF); pixel *pix; if( SLICE_MBAFF ) { // border samples for each field are extended separately - pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift); - plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, shift ); - plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, shift ); + pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift); + plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, h_shift ); + plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, h_shift ); - height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> shift; + height = (b_end ? 
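Border extension now splits the old CHROMA444-based shift into the two subsampling axes: every plane pads PADH columns (for chroma, the interleaved U/V samples make up the width, which is what the h_shift argument to plane_expand_border encodes), while vertical padding shrinks by v_shift, so a 4:2:2 chroma plane is padded over its full height. Restated for illustration:

    int padh = PADH;            /* per-plane horizontal padding, format-independent */
    int padv = PADV >> v_shift; /* halved for 4:2:0 only; 4:2:2 pads full height */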
16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
             if( b_end && !b_start )
-                height += 4 >> shift;
-            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, shift );
+                height += 4 >> v_shift;
+            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
         }
         else
         {
-            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, shift );
+            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
         }
     }
 }
@@ -545,9 +554,9 @@ void x264_frame_expand_border_lowres( x264_frame_t *frame )
 
 void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
 {
-    int shift = !CHROMA444;
-    plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>shift,
-                         PADH, PADV>>shift, 1, 1, shift );
+    int v_shift = h->mb.chroma_v_shift;
+    plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>v_shift,
+                         PADH, PADV>>v_shift, 1, 1, h->mb.chroma_h_shift );
 }
 
 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
@@ -555,17 +564,18 @@ void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
     for( int i = 0; i < frame->i_plane; i++ )
     {
         int i_width = h->param.i_width;
-        int shift = i && !CHROMA444;
-        int i_height = h->param.i_height >> shift;
+        int h_shift = i && h->mb.chroma_h_shift;
+        int v_shift = i && h->mb.chroma_v_shift;
+        int i_height = h->param.i_height >> v_shift;
 
         int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
-        int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> shift;
+        int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
 
         if( i_padx )
         {
             for( int y = 0; y < i_height; y++ )
                 pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
-                              &frame->plane[i][y*frame->i_stride[i] + i_width - 1-shift],
-                              i_padx>>shift, sizeof(pixel)<<shift );
+                              &frame->plane[i][y*frame->i_stride[i] + i_width - 1-h_shift],
+                              i_padx>>h_shift, sizeof(pixel)<<h_shift );
         }
@@ -580,9 +590,9 @@ void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
 {
     for( int i = 0; i < h->fenc->i_plane; i++ )
     {
-        int shift = i && !CHROMA444;
+        int v_shift = i && h->mb.chroma_v_shift;
         int stride = h->fenc->i_stride[i];
-        int height = h->param.i_height >> shift;
-        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> shift;
+        int height = h->param.i_height >> v_shift;
+        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
         pixel *fenc = h->fenc->plane[i] + 16*mb_x;
         for( int y = height; y < height + pady; y++ )
             memcpy( fenc + y*stride, fenc + (height-1)*stride, 16*sizeof(pixel) );
diff --git a/common/frame.h b/common/frame.h
index 77af60d1..a13e05b4 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -181,12 +181,20 @@ typedef struct
 {
     x264_deblock_inter_t deblock_luma[2];
     x264_deblock_inter_t deblock_chroma[2];
+    x264_deblock_inter_t deblock_h_chroma_420;
+    x264_deblock_inter_t deblock_h_chroma_422;
     x264_deblock_intra_t deblock_luma_intra[2];
     x264_deblock_intra_t deblock_chroma_intra[2];
+    x264_deblock_intra_t deblock_h_chroma_420_intra;
+    x264_deblock_intra_t deblock_h_chroma_422_intra;
     x264_deblock_inter_t deblock_luma_mbaff;
     x264_deblock_inter_t deblock_chroma_mbaff;
+    x264_deblock_inter_t deblock_chroma_420_mbaff;
+    x264_deblock_inter_t deblock_chroma_422_mbaff;
     x264_deblock_intra_t deblock_luma_intra_mbaff;
     x264_deblock_intra_t
deblock_chroma_intra_mbaff; + x264_deblock_intra_t deblock_chroma_420_intra_mbaff; + x264_deblock_intra_t deblock_chroma_422_intra_mbaff; void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); diff --git a/common/macroblock.c b/common/macroblock.c index 7c524ff0..f985e772 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -6,6 +6,7 @@ * Authors: Fiona Glaser * Laurent Aimar * Loren Merritt + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -50,23 +51,27 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h } else { - // chroma is offset if MCing from a field of opposite parity - if( MB_INTERLACED & i_ref ) + int v_shift = h->mb.chroma_v_shift; + // Chroma in 4:2:0 is offset if MCing from a field of opposite parity + if( v_shift & MB_INTERLACED & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], - &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x; + height = 4*height >> v_shift; + + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset], + &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1], - mvx, mvy, 2*width, 2*height ); + mvx, 2*mvy>>v_shift, 2*width, height ); if( h->sh.weight[i_ref][1].weightfn ) - h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - &h->sh.weight[i_ref][1], height*2 ); + h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, + &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, + &h->sh.weight[i_ref][1], height ); if( h->sh.weight[i_ref][2].weightfn ) - h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - &h->sh.weight[i_ref][2],height*2 ); + h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, + &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, + &h->sh.weight[i_ref][2], height ); } } static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height ) @@ -85,13 +90,15 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h } else { - if( MB_INTERLACED & i_ref ) + int v_shift = h->mb.chroma_v_shift; + if( v_shift & MB_INTERLACED & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], - &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x; + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset], + &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1], - mvx, mvy, 2*width, 2*height ); + mvx, 2*mvy>>v_shift, 2*width, 4*height>>v_shift ); } } @@ -128,17 +135,21 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int } else { - if( MB_INTERLACED & i_ref0 ) + int v_shift = h->mb.chroma_v_shift; + if( v_shift & MB_INTERLACED & i_ref0 ) mvy0 += (h->mb.i_mb_y & 1)*4 - 2; - if( MB_INTERLACED & i_ref1 ) + if( v_shift & MB_INTERLACED & i_ref1 ) mvy1 += (h->mb.i_mb_y & 1)*4 - 2; h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], - mvx0, mvy0, 2*width, 2*height ); 
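The motion-vector handling is the subtle part of 4:2:2 chroma MC: mc_chroma expects vectors in eighth chroma-pel units, while mvy is in quarter luma-pel units. With half vertical chroma resolution (4:2:0) the numeric value carries over unchanged; with full vertical resolution (4:2:2) it must be doubled, hence 2*mvy>>v_shift in these calls. Note also that only 4:2:0 keeps the field-parity offset, since only there does a chroma field sit at a fractional-line offset from the opposite-parity reference field. For illustration:

    int chroma_mvy = 2*mvy >> v_shift; /* == mvy for 4:2:0, 2*mvy for 4:2:2 */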
+ mvx0, 2*mvy0>>v_shift, 2*width, 4*height>>v_shift ); h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1], - mvx1, mvy1, 2*width, 2*height ); - h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); - h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight ); + mvx1, 2*mvy1>>v_shift, 2*width, 4*height>>v_shift ); + + int chromapix = h->luma2chroma_pixel[i_mode]; + int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x; + h->mc.avg[chromapix]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); + h->mc.avg[chromapix]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight ); } } @@ -301,7 +312,9 @@ int x264_macroblock_cache_allocate( x264_t *h ) } else { - luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*16+2*i_padv); + /* Both ref and fenc is stored for 4:2:0 and 4:2:2 which means that 4:2:0 and 4:4:4 + * needs the same amount of space and 4:2:2 needs twice that much */ + luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*(16<<(CHROMA_FORMAT==CHROMA_422))+2*i_padv); if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ) //smart can weight one ref and one offset -1 in 8-bit @@ -491,6 +504,24 @@ void x264_macroblock_thread_init( x264_t *h ) (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I); h->mb.i_mb_prev_xy = -1; + /* 4:2:0 4:2:2 4:4:4 + * fdec fenc fdec fenc fdec fenc + * y y y y y y y Y Y Y Y y y y y y y y Y Y Y Y y y y y y y y Y Y Y Y + * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y + * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y + * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y + * y Y Y Y Y U U V V y Y Y Y Y U U V V y Y Y Y Y U U U U + * u u u v v v U U V V u u u v v v U U V V u u u u u u u U U U U + * u U U v V V u U U v V V U U V V u U U U U U U U U + * u U U v V V u U U v V V U U V V u U U U U U U U U + * u U U v V V u U U U U V V V V + * u U U v V V u U U U U V V V V + * v v v v v v v V V V V + * v V V V V V V V V + * v V V V V + * v V V V V + * v V V V V + */ h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf; h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE; h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE; @@ -500,16 +531,6 @@ void x264_macroblock_thread_init( x264_t *h ) h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE; h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE; } - /* fdec: fenc: - * yyyyyyy - * yYYYY YYYY - * yYYYY YYYY - * yYYYY YYYY - * yYYYY YYYY - * uuu vvv UUVV - * uUU vVV UUVV - * uUU vVV - */ else { h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8; @@ -522,7 +543,7 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ) int stride_y = fenc->i_stride[0]; int stride_uv = fenc->i_stride[1]; int off_y = 16 * i_mb_x + 16 * i_mb_y * stride_y; - int off_uv = 16 * i_mb_x + 8 * i_mb_y * stride_uv; + int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> h->mb.chroma_v_shift); h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y, fenc->plane[1]+off_uv, stride_uv, i_mb_x ); } @@ -537,12 +558,12 @@ NOINLINE void x264_copy_column8( pixel *dst, pixel *src ) static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff ) { int mb_interlaced = b_mbaff && MB_INTERLACED; - int w = b_chroma ? 8 : 16; + int height = b_chroma ? 
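The fenc/fdec layout diagram above follows from two per-format shifts; an illustrative helper (not from the patch) that computes the chroma block size of one macroblock:

    /* chroma_format_idc: 1 = 4:2:0, 2 = 4:2:2, 3 = 4:4:4 */
    static void mb_chroma_size( int chroma_format_idc, int *w, int *h )
    {
        int h_shift = chroma_format_idc <= 2; /* 4:2:0 and 4:2:2 halve the width */
        int v_shift = chroma_format_idc == 1; /* only 4:2:0 halves the height */
        *w = 16 >> h_shift; /* 8, 8, 16 */
        *h = 16 >> v_shift; /* 8, 16, 16 */
    }

This is the pair stored in h->mb.chroma_h_shift/chroma_v_shift and consumed by CHROMA_SIZE().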
16 >> h->mb.chroma_v_shift : 16; int i_stride = h->fdec->i_stride[i]; int i_stride2 = i_stride << mb_interlaced; int i_pix_offset = mb_interlaced - ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride - : 16 * mb_x + w * mb_y * i_stride; + ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride + : 16 * mb_x + height * mb_y * i_stride; pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset]; int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : 0; pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16]; @@ -554,7 +575,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset]; if( b_chroma ) { - h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 ); + h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height ); memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) ); memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) ); if( b_mbaff ) @@ -572,7 +593,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x } if( b_mbaff ) { - for( int j = 0; j < w; j++ ) + for( int j = 0; j < height; j++ ) if( b_chroma ) { h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2]; @@ -854,8 +875,8 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m /* load non_zero_count */ CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] ); - CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16+4 + 8*CHROMA444] ); - CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32+4 + 8*CHROMA444] ); + CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>h->mb.chroma_v_shift)] ); + CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>h->mb.chroma_v_shift)] ); /* Finish the prefetching */ for( int l = 0; l < lists; l++ ) @@ -906,16 +927,17 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = nnz[lbot][left_index_table->nnz[2]]; h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[lbot][left_index_table->nnz[3]]; - if( CHROMA444 ) + if( CHROMA_FORMAT >= CHROMA_422 ) { - h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16]; - h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16]; - h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16]; - h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16]; - h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32]; - h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32]; - h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32]; - h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32]; + int offset = (4>>h->mb.chroma_h_shift) - 4; + h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16+offset]; + h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16+offset]; + h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16+offset]; + h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = 
nnz[lbot][left_index_table->nnz[3]+16+offset]; + h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32+offset]; + h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32+offset]; + h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32+offset]; + h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32+offset]; } else { @@ -943,7 +965,7 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = 0x80; - if( CHROMA444 ) + if( CHROMA_FORMAT >= CHROMA_422 ) { h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = @@ -983,6 +1005,11 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m { x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE ); x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+12*FDEC_STRIDE ); + x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+12*FDEC_STRIDE ); + } x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 0 ); } } @@ -1424,15 +1451,17 @@ void x264_macroblock_deblock_strength( x264_t *h ) } /* Early termination: in this case, nnz guarantees all edges use strength 2.*/ - if( h->mb.b_transform_8x8 && (h->mb.i_cbp_luma&7) == 7 && !CHROMA444 ) + if( h->mb.b_transform_8x8 && !CHROMA444 ) { - M32( bs[0][0] ) = 0x02020202; - M32( bs[0][2] ) = 0x02020202; - M32( bs[0][4] ) = 0x02020202; - M32( bs[1][0] ) = 0x02020202; - M32( bs[1][2] ) = 0x02020202; - M32( bs[1][4] ) = 0x02020202; - return; + int cbp_mask = 0xf >> h->mb.chroma_v_shift; + if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask ) + { + M32( bs[0][0] ) = 0x02020202; + M32( bs[0][2] ) = 0x02020202; + M32( bs[0][4] ) = 0x02020202; + memset( bs[1][0], 2, 5*4*sizeof(uint8_t) ); /* [1][1] and [1][3] has to be set for 4:2:2 */ + return; + } } int neighbour_changed = 0; @@ -1595,14 +1624,14 @@ void x264_macroblock_deblock_strength( x264_t *h ) static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff ) { - int w = b_chroma ? 8 : 16; + int height = b_chroma ? 16>>h->mb.chroma_v_shift : 16; int i_stride = h->fdec->i_stride[i]; int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED); int i_pix_offset = (b_mbaff && MB_INTERLACED) - ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride - : 16 * mb_x + w * mb_y * i_stride; + ? 
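In the early-termination path above, 4:2:2 must keep strength 2 on bs[1][1] and bs[1][3] as well, because its 8x16 chroma blocks have horizontal 4x4-transform edges at chroma rows 4 and 12 even when luma uses the 8x8 transform. The memset covers bs[1][0..4] in one call and is equivalent to, for illustration:

    for( int e = 0; e < 5; e++ )
        M32( bs[1][e] ) = 0x02020202; /* horizontal edges 0..4, strength 2 each */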
16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride + : 16 * mb_x + height * mb_y * i_stride; if( b_chroma ) - h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] ); + h->mc.store_interleave_chroma( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], height ); else h->mc.copy[PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, h->mb.pic.p_fdec[i], FDEC_STRIDE, 16 ); } @@ -1622,8 +1651,9 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int } else { - memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*7, 8*sizeof(pixel) ); - memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+FDEC_STRIDE*7, 8*sizeof(pixel) ); + int backup_src = (15>>h->mb.chroma_v_shift) * FDEC_STRIDE; + memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) ); + memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) ); } if( b_mbaff ) { @@ -1639,7 +1669,8 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int } else { - backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE; + if( CHROMA_FORMAT == CHROMA_420 ) + backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE; memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) ); memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) ); } @@ -1650,8 +1681,8 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int /* In progressive we update intra_border_backup in-place, so the topleft neighbor will * no longer exist there when load_pic_pointers wants it. Move it within p_fdec instead. 
*/ h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[0][-FDEC_STRIDE+15]; - h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+7 + 8*CHROMA444]; - h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+7 + 8*CHROMA444]; + h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+(15>>h->mb.chroma_h_shift)]; + h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+(15>>h->mb.chroma_h_shift)]; } } @@ -1744,7 +1775,7 @@ void x264_macroblock_cache_save( x264_t *h ) CP32( &nnz[16+1*4], &h->mb.cache.non_zero_count[x264_scan8[16+2]] ); CP32( &nnz[32+0*4], &h->mb.cache.non_zero_count[x264_scan8[32+0]] ); CP32( &nnz[32+1*4], &h->mb.cache.non_zero_count[x264_scan8[32+2]] ); - if( CHROMA444 ) + if( CHROMA_FORMAT >= CHROMA_422 ) { CP32( &nnz[16+2*4], &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ); CP32( &nnz[16+3*4], &h->mb.cache.non_zero_count[x264_scan8[16+10]] ); @@ -1809,7 +1840,7 @@ void x264_macroblock_cache_save( x264_t *h ) uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy]; uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy]; if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM ) - h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ]; + h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]; else h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC; diff --git a/common/macroblock.h b/common/macroblock.h index 7f5d5661..12b90c62 100644 --- a/common/macroblock.h +++ b/common/macroblock.h @@ -397,15 +397,6 @@ static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b ) # define pack_pixel_2to4 pack16to32 #endif -#define array_non_zero(a) array_non_zero_int(a, sizeof(a)/sizeof(dctcoef)) -#define array_non_zero_int array_non_zero_int -static ALWAYS_INLINE int array_non_zero_int( dctcoef *v, int i_count ) -{ - for( int i = 0; i < i_count; i++ ) - if( v[i] ) - return 1; - return 0; -} static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx ) { const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1]; diff --git a/common/mc.c b/common/mc.c index 5352be14..c2b77f58 100644 --- a/common/mc.c +++ b/common/mc.c @@ -90,9 +90,11 @@ PIXEL_AVG_C( pixel_avg_16x8, 16, 8 ) PIXEL_AVG_C( pixel_avg_8x16, 8, 16 ) PIXEL_AVG_C( pixel_avg_8x8, 8, 8 ) PIXEL_AVG_C( pixel_avg_8x4, 8, 4 ) +PIXEL_AVG_C( pixel_avg_4x16, 4, 16 ) PIXEL_AVG_C( pixel_avg_4x8, 4, 8 ) PIXEL_AVG_C( pixel_avg_4x4, 4, 4 ) PIXEL_AVG_C( pixel_avg_4x2, 4, 2 ) +PIXEL_AVG_C( pixel_avg_2x8, 2, 8 ) PIXEL_AVG_C( pixel_avg_2x4, 2, 4 ) PIXEL_AVG_C( pixel_avg_2x2, 2, 2 ) @@ -330,9 +332,9 @@ void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta, } } -static void store_interleave_8x8x2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ) +static void store_interleave_chroma( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ) { - for( int y=0; y<8; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE ) + for( int y=0; ymc_luma = mc_luma; pf->get_ref = get_ref; + pf->mc_chroma = mc_chroma; pf->avg[PIXEL_16x16]= pixel_avg_16x16; @@ -474,9 +477,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_8x16] = pixel_avg_8x16; pf->avg[PIXEL_8x8] = pixel_avg_8x8; pf->avg[PIXEL_8x4] = pixel_avg_8x4; + pf->avg[PIXEL_4x16] = pixel_avg_4x16; pf->avg[PIXEL_4x8] = pixel_avg_4x8; pf->avg[PIXEL_4x4] = pixel_avg_4x4; pf->avg[PIXEL_4x2] = pixel_avg_4x2; + pf->avg[PIXEL_2x8] = pixel_avg_2x8; pf->avg[PIXEL_2x4] = pixel_avg_2x4; pf->avg[PIXEL_2x2] = pixel_avg_2x2; @@ -490,9 +495,9 @@ void 
x264_mc_init( int cpu, x264_mc_functions_t *pf ) pf->copy[PIXEL_8x8] = mc_copy_w8; pf->copy[PIXEL_4x4] = mc_copy_w4; - pf->store_interleave_8x8x2 = store_interleave_8x8x2; - pf->load_deinterleave_8x8x2_fenc = load_deinterleave_8x8x2_fenc; - pf->load_deinterleave_8x8x2_fdec = load_deinterleave_8x8x2_fdec; + pf->store_interleave_chroma = store_interleave_chroma; + pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc; + pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec; pf->plane_copy = x264_plane_copy_c; pf->plane_copy_interleave = x264_plane_copy_interleave_c; diff --git a/common/mc.h b/common/mc.h index 15a0a254..09dda557 100644 --- a/common/mc.h +++ b/common/mc.h @@ -62,30 +62,27 @@ extern const x264_weight_t x264_weight_none[3]; typedef struct { - void (*mc_luma)(pixel *dst, int i_dst, pixel **src, int i_src, - int mvx, int mvy, - int i_width, int i_height, const x264_weight_t *weight ); + void (*mc_luma)( pixel *dst, int i_dst, pixel **src, int i_src, + int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ); /* may round up the dimensions if they're not a power of 2 */ - pixel* (*get_ref)(pixel *dst, int *i_dst, pixel **src, int i_src, - int mvx, int mvy, - int i_width, int i_height, const x264_weight_t *weight ); + pixel* (*get_ref)( pixel *dst, int *i_dst, pixel **src, int i_src, + int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ); /* mc_chroma may write up to 2 bytes of garbage to the right of dst, * so it must be run from left to right. */ - void (*mc_chroma)(pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src, - int mvx, int mvy, - int i_width, int i_height ); + void (*mc_chroma)( pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src, + int mvx, int mvy, int i_width, int i_height ); - void (*avg[10])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight ); + void (*avg[12])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight ); /* only 16x16, 8x8, and 4x4 defined */ void (*copy[7])( pixel *dst, int, pixel *src, int, int i_height ); void (*copy_16x16_unaligned)( pixel *dst, int, pixel *src, int, int i_height ); - void (*store_interleave_8x8x2)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ); - void (*load_deinterleave_8x8x2_fenc)( pixel *dst, pixel *src, int i_src ); - void (*load_deinterleave_8x8x2_fdec)( pixel *dst, pixel *src, int i_src ); + void (*store_interleave_chroma)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); + void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, int i_src, int height ); + void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, int i_src, int height ); void (*plane_copy)( pixel *dst, int i_dst, pixel *src, int i_src, int w, int h ); diff --git a/common/pixel.c b/common/pixel.c index 91dc1b87..b346681b 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -68,10 +68,10 @@ PIXEL_SAD_C( x264_pixel_sad_16x8, 16, 8 ) PIXEL_SAD_C( x264_pixel_sad_8x16, 8, 16 ) PIXEL_SAD_C( x264_pixel_sad_8x8, 8, 8 ) PIXEL_SAD_C( x264_pixel_sad_8x4, 8, 4 ) +PIXEL_SAD_C( x264_pixel_sad_4x16, 4, 16 ) PIXEL_SAD_C( x264_pixel_sad_4x8, 4, 8 ) PIXEL_SAD_C( x264_pixel_sad_4x4, 4, 4 ) - /**************************************************************************** * pixel_ssd_WxH ****************************************************************************/ @@ -98,6 +98,7 @@ PIXEL_SSD_C( x264_pixel_ssd_16x8, 16, 8 ) PIXEL_SSD_C( x264_pixel_ssd_8x16, 8, 16 ) PIXEL_SSD_C( x264_pixel_ssd_8x8, 8, 8 ) PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 ) 
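The new block sizes exist because 4:2:2 halves only the chroma width: the chroma companion of a 16x16, 8x16, or 4x8 luma partition is 8x16, 4x16, or 2x8, which is what h->luma2chroma_pixel tabulates and why avg gains 4x16/2x8 kernels while sad/ssd/satd gain 4x16. An illustrative helper, not from the patch:

    static void chroma_partition_422( int lw, int lh, int *cw, int *ch )
    {
        *cw = lw >> 1; /* 16x16 -> 8x16, 8x16 -> 4x16, 4x8 -> 2x8 */
        *ch = lh;      /* height is preserved in 4:2:2 */
    }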
+PIXEL_SSD_C( x264_pixel_ssd_4x16, 4, 16 ) PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 ) PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 ) @@ -169,11 +170,11 @@ void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pi /**************************************************************************** * pixel_var_wxh ****************************************************************************/ -#define PIXEL_VAR_C( name, w ) \ +#define PIXEL_VAR_C( name, w, h ) \ static uint64_t name( pixel *pix, int i_stride ) \ { \ uint32_t sum = 0, sqr = 0; \ - for( int y = 0; y < w; y++ ) \ + for( int y = 0; y < h; y++ ) \ { \ for( int x = 0; x < w; x++ ) \ { \ @@ -185,32 +186,37 @@ static uint64_t name( pixel *pix, int i_stride ) \ return sum + ((uint64_t)sqr << 32); \ } -PIXEL_VAR_C( x264_pixel_var_16x16, 16 ) -PIXEL_VAR_C( x264_pixel_var_8x8, 8 ) +PIXEL_VAR_C( x264_pixel_var_16x16, 16, 16 ) +PIXEL_VAR_C( x264_pixel_var_8x16, 8, 16 ) +PIXEL_VAR_C( x264_pixel_var_8x8, 8, 8 ) /**************************************************************************** * pixel_var2_wxh ****************************************************************************/ -static int pixel_var2_8x8( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd ) -{ - uint32_t var = 0, sum = 0, sqr = 0; - for( int y = 0; y < 8; y++ ) - { - for( int x = 0; x < 8; x++ ) - { - int diff = pix1[x] - pix2[x]; - sum += diff; - sqr += diff * diff; - } - pix1 += i_stride1; - pix2 += i_stride2; - } - sum = abs(sum); - var = sqr - ((uint64_t)sum * sum >> 6); - *ssd = sqr; - return var; +#define PIXEL_VAR2_C( name, w, h ) \ +static int name( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd ) \ +{ \ + uint32_t var = 0, sum = 0, sqr = 0; \ + for( int y = 0; y < h; y++ ) \ + { \ + for( int x = 0; x < w; x++ ) \ + { \ + int diff = pix1[x] - pix2[x]; \ + sum += diff; \ + sqr += diff * diff; \ + } \ + pix1 += i_stride1; \ + pix2 += i_stride2; \ + } \ + sum = abs(sum); \ + var = sqr - ((uint64_t)sum * sum >> 6); \ + *ssd = sqr; \ + return var; \ } +PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16 ) +PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 8 ) + #if BIT_DEPTH > 8 typedef uint32_t sum_t; typedef uint64_t sum2_t; @@ -309,9 +315,9 @@ PIXEL_SATD_C( 16, 16, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 16, 8, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 8, 16, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 8, 8, x264_pixel_satd_8x4 ) +PIXEL_SATD_C( 4, 16, x264_pixel_satd_4x4 ) PIXEL_SATD_C( 4, 8, x264_pixel_satd_4x4 ) - static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) { sum2_t tmp[8][4]; @@ -535,6 +541,8 @@ INTRA_MBCMP( sad, 4x4, v, h, dc, ,, _c ) INTRA_MBCMP(satd, 4x4, v, h, dc, ,, _c ) INTRA_MBCMP( sad, 8x8, dc, h, v, c,, _c ) INTRA_MBCMP(satd, 8x8, dc, h, v, c,, _c ) +INTRA_MBCMP( sad, 8x16, dc, h, v, c,, _c ) +INTRA_MBCMP(satd, 8x16, dc, h, v, c,, _c ) INTRA_MBCMP( sad, 16x16, v, h, dc, ,, _c ) INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c ) @@ -754,23 +762,27 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #define INIT7_NAME( name1, name2, cpu ) \ INIT6_NAME( name1, name2, cpu ) \ pixf->name1[PIXEL_4x4] = x264_pixel_##name2##_4x4##cpu; +#define INIT8_NAME( name1, name2, cpu ) \ + INIT7_NAME( name1, name2, cpu ) \ + pixf->name1[PIXEL_4x16] = x264_pixel_##name2##_4x16##cpu; #define INIT2( name, cpu ) INIT2_NAME( name, name, cpu ) #define INIT4( name, cpu ) INIT4_NAME( name, name, cpu ) #define INIT5( name, cpu ) INIT5_NAME( name, name, cpu ) #define INIT6( name, cpu ) INIT6_NAME( name, name, cpu ) #define INIT7( name, cpu ) 
INIT7_NAME( name, name, cpu ) +#define INIT8( name, cpu ) INIT8_NAME( name, name, cpu ) #define INIT_ADS( cpu ) \ pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\ pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\ pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu; - INIT7( sad, ); - INIT7_NAME( sad_aligned, sad, ); + INIT8( sad, ); + INIT8_NAME( sad_aligned, sad, ); INIT7( sad_x3, ); INIT7( sad_x4, ); - INIT7( ssd, ); - INIT7( satd, ); + INIT8( ssd, ); + INIT8( satd, ); INIT7( satd_x3, ); INIT7( satd_x4, ); INIT4( hadamard_ac, ); @@ -779,12 +791,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8; pixf->ssd_nv12_core = pixel_ssd_nv12_core; pixf->ssim_4x4x2_core = ssim_4x4x2_core; pixf->ssim_end4 = ssim_end4; - pixf->var2_8x8 = pixel_var2_8x8; pixf->vsad = pixel_vsad; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4; @@ -793,6 +807,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c; + pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c; + pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16; @@ -813,7 +829,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2; - pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2; @@ -837,8 +853,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; - pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; - pixf->var2_8x8 = x264_pixel_var2_8x8_sse2; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; } if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) { @@ -937,7 +953,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2; - pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2; pixf->vsad = x264_pixel_vsad_mmx2; if( cpu&X264_CPU_CACHELINE_32 ) @@ -986,7 +1002,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; #endif - pixf->var2_8x8 = x264_pixel_var2_8x8_sse2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; pixf->vsad = x264_pixel_vsad_sse2; } @@ -1072,7 +1088,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3; #endif - pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3; if( cpu&X264_CPU_SHUFFLE_IS_FAST ) 
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) @@ -1154,7 +1170,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; - pixf->var2_8x8 = x264_pixel_var2_8x8_neon; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon; diff --git a/common/pixel.h b/common/pixel.h index c7ee0fbf..d2ea52f5 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -5,6 +5,7 @@ * * Authors: Loren Merritt * Fiona Glaser + Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -42,22 +43,19 @@ enum PIXEL_8x4 = 4, PIXEL_4x8 = 5, PIXEL_4x4 = 6, - PIXEL_4x2 = 7, - PIXEL_2x4 = 8, - PIXEL_2x2 = 9, + + /* Subsampled chroma only */ + PIXEL_4x16 = 7, /* 4:2:2 */ + PIXEL_4x2 = 8, + PIXEL_2x8 = 9, /* 4:2:2 */ + PIXEL_2x4 = 10, + PIXEL_2x2 = 11, }; -static const struct -{ - int w; - int h; -} x264_pixel_size[7] = +static const struct { uint8_t w, h; } x264_pixel_size[12] = { - { 16, 16 }, - { 16, 8 }, { 8, 16 }, - { 8, 8 }, - { 8, 4 }, { 4, 8 }, - { 4, 4 } + { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, + { 4, 16 }, { 4, 2 }, { 2, 8 }, { 2, 4 }, { 2, 2 }, }; static const uint8_t x264_size2pixel[5][5] = @@ -69,23 +67,32 @@ static const uint8_t x264_size2pixel[5][5] = { 0, 0, PIXEL_8x16, 0, PIXEL_16x16 } }; +static const uint8_t x264_luma2chroma_pixel[4][7] = +{ + { 0 }, + { PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4, PIXEL_4x2, PIXEL_2x4, PIXEL_2x2 }, /* 4:2:0 */ + { PIXEL_8x16, PIXEL_8x8, PIXEL_4x16, PIXEL_4x8, PIXEL_4x4, PIXEL_2x8, PIXEL_2x4 }, /* 4:2:2 */ + { PIXEL_16x16, PIXEL_16x8, PIXEL_8x16, PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4 }, /* 4:4:4 */ +}; + typedef struct { - x264_pixel_cmp_t sad[7]; - x264_pixel_cmp_t ssd[7]; - x264_pixel_cmp_t satd[7]; + x264_pixel_cmp_t sad[8]; + x264_pixel_cmp_t ssd[8]; + x264_pixel_cmp_t satd[8]; x264_pixel_cmp_t ssim[7]; x264_pixel_cmp_t sa8d[4]; - x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */ - x264_pixel_cmp_t mbcmp_unaligned[7]; /* unaligned mbcmp for subpel */ - x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */ + x264_pixel_cmp_t mbcmp[8]; /* either satd or sad for subpel refine and mode decision */ + x264_pixel_cmp_t mbcmp_unaligned[8]; /* unaligned mbcmp for subpel */ + x264_pixel_cmp_t fpelcmp[8]; /* either satd or sad for fullpel motion search */ x264_pixel_cmp_x3_t fpelcmp_x3[7]; x264_pixel_cmp_x4_t fpelcmp_x4[7]; - x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */ + x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */ int (*vsad)( pixel *, int, int ); - int (*var2_8x8)( pixel *, int, pixel *, int, int * ); uint64_t (*var[4])( pixel *pix, int stride ); + int (*var2[4])( pixel *pix1, int stride1, + pixel *pix2, int stride2, int *ssd ); uint64_t (*hadamard_ac[4])( pixel *pix, int stride ); void (*ssd_nv12_core)( pixel *pixuv1, int stride1, @@ -110,12 +117,18 @@ typedef struct void (*intra_mbcmp_x3_16x16)( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_satd_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_sad_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] ); - void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); - void 
(*intra_satd_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); - void (*intra_sad_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_mbcmp_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_satd_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_sad_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_mbcmp_x3_chroma)( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_satd_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_sad_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_mbcmp_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_satd_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_sad_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_satd_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_sad_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_mbcmp_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); void (*intra_sa8d_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); void (*intra_sad_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); diff --git a/common/predict.c b/common/predict.c index 34798c2f..f5ed6426 100644 --- a/common/predict.c +++ b/common/predict.c @@ -6,6 +6,7 @@ * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -151,7 +152,7 @@ void x264_predict_16x16_p_c( pixel *src ) /**************************************************************************** - * 8x8 prediction for intra chroma block + * 8x8 prediction for intra chroma block (4:2:0) ****************************************************************************/ static void x264_predict_8x8c_dc_128_c( pixel *src ) @@ -297,6 +298,167 @@ void x264_predict_8x8c_p_c( pixel *src ) } } +/**************************************************************************** + * 8x16 prediction for intra chroma block (4:2:2) + ****************************************************************************/ + +static void x264_predict_8x16c_dc_128_c( pixel *src ) +{ + for( int y = 0; y < 16; y++ ) + { + MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ); + MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ); + src += FDEC_STRIDE; + } +} +static void x264_predict_8x16c_dc_left_c( pixel *src ) +{ + for( int i = 0; i < 4; i++ ) + { + int dc = 0; + + for( int y = 0; y < 4; y++ ) + dc += src[y*FDEC_STRIDE - 1]; + + pixel4 dcsplat = PIXEL_SPLAT_X4( (dc + 2) >> 2 ); + + for( int y = 0; y < 4; y++ ) + { + MPIXEL_X4( src+0 ) = dcsplat; + MPIXEL_X4( src+4 ) = dcsplat; + src += FDEC_STRIDE; + } + } +} +static void x264_predict_8x16c_dc_top_c( pixel *src ) +{ + int dc0 = 0, dc1 = 0; + + for(int x = 0; x < 4; x++ ) + { + dc0 += src[x - FDEC_STRIDE]; + dc1 += src[x + 4 - FDEC_STRIDE]; + } + pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 ); + pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 ); + + for( int y = 0; y < 16; y++ ) + { + MPIXEL_X4( src+0 ) = dc0splat; + MPIXEL_X4( src+4 ) = dc1splat; + src += FDEC_STRIDE; + } +} +void x264_predict_8x16c_dc_c( pixel *src ) +{ + int s0 = 0, s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0; + + /* + s0 s1 + s2 + s3 + s4 + s5 + */ + for( int i = 0; i < 4; i++ ) + { + s0 += src[i+0 - FDEC_STRIDE]; + s1 += src[i+4 - FDEC_STRIDE]; + s2 += src[-1 + (i+0) * FDEC_STRIDE]; + s3 += src[-1 + (i+4) * 
FDEC_STRIDE]; + s4 += src[-1 + (i+8) * FDEC_STRIDE]; + s5 += src[-1 + (i+12) * FDEC_STRIDE]; + } + /* + dc0 dc1 + dc2 dc3 + dc4 dc5 + dc6 dc7 + */ + pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 ); + pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 ); + pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 ); + pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 ); + pixel4 dc4 = PIXEL_SPLAT_X4( ( s4 + 2 ) >> 2 ); + pixel4 dc5 = PIXEL_SPLAT_X4( ( s1 + s4 + 4 ) >> 3 ); + pixel4 dc6 = PIXEL_SPLAT_X4( ( s5 + 2 ) >> 2 ); + pixel4 dc7 = PIXEL_SPLAT_X4( ( s1 + s5 + 4 ) >> 3 ); + + for( int y = 0; y < 4; y++ ) + { + MPIXEL_X4( src+0 ) = dc0; + MPIXEL_X4( src+4 ) = dc1; + src += FDEC_STRIDE; + } + for( int y = 0; y < 4; y++ ) + { + MPIXEL_X4( src+0 ) = dc2; + MPIXEL_X4( src+4 ) = dc3; + src += FDEC_STRIDE; + } + for( int y = 0; y < 4; y++ ) + { + MPIXEL_X4( src+0 ) = dc4; + MPIXEL_X4( src+4 ) = dc5; + src += FDEC_STRIDE; + } + for( int y = 0; y < 4; y++ ) + { + MPIXEL_X4( src+0 ) = dc6; + MPIXEL_X4( src+4 ) = dc7; + src += FDEC_STRIDE; + } +} +void x264_predict_8x16c_h_c( pixel *src ) +{ + for( int i = 0; i < 16; i++ ) + { + pixel4 v = PIXEL_SPLAT_X4( src[-1] ); + MPIXEL_X4( src+0 ) = v; + MPIXEL_X4( src+4 ) = v; + src += FDEC_STRIDE; + } +} +void x264_predict_8x16c_v_c( pixel *src ) +{ + pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE ); + pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE ); + + for( int i = 0; i < 16; i++ ) + { + MPIXEL_X4( src+0 ) = v0; + MPIXEL_X4( src+4 ) = v1; + src += FDEC_STRIDE; + } +} +void x264_predict_8x16c_p_c( pixel *src ) +{ + int H = 0; + int V = 0; + + for( int i = 0; i < 4; i++ ) + H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] ); + for( int i = 0; i < 8; i++ ) + V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] ); + + int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] ); + int b = ( 17 * H + 16 ) >> 5; + int c = ( 5 * V + 32 ) >> 6; + int i00 = a -3*b -7*c + 16; + + for( int y = 0; y < 16; y++ ) + { + int pix = i00; + for( int x = 0; x < 8; x++ ) + { + src[x] = x264_clip_pixel( pix>>5 ); + pix += b; + } + src += FDEC_STRIDE; + i00 += c; + } +} + /**************************************************************************** * 4x4 prediction for intra luma block ****************************************************************************/ @@ -762,6 +924,17 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] ) #endif } +void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] ) +{ + pf[I_PRED_CHROMA_V ] = x264_predict_8x16c_v_c; + pf[I_PRED_CHROMA_H ] = x264_predict_8x16c_h_c; + pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_c; + pf[I_PRED_CHROMA_P ] = x264_predict_8x16c_p_c; + pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_c; + pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_c; + pf[I_PRED_CHROMA_DC_128 ]= x264_predict_8x16c_dc_128_c; +} + void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) { pf[I_PRED_8x8_V] = x264_predict_8x8_v_c; diff --git a/common/predict.h b/common/predict.h index 23330f51..8ceb5773 100644 --- a/common/predict.h +++ b/common/predict.h @@ -42,7 +42,7 @@ enum intra_chroma_pred_e I_PRED_CHROMA_DC_TOP = 5, I_PRED_CHROMA_DC_128 = 6 }; -static const uint8_t x264_mb_pred_mode8x8c_fix[7] = +static const uint8_t x264_mb_chroma_pred_mode_fix[7] = { I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P, I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC @@ -123,9 +123,14 @@ void x264_predict_8x8c_dc_c ( pixel *src ); void 
x264_predict_8x8c_h_c ( pixel *src ); void x264_predict_8x8c_v_c ( pixel *src ); void x264_predict_8x8c_p_c ( pixel *src ); +void x264_predict_8x16c_dc_c( pixel *src ); +void x264_predict_8x16c_h_c ( pixel *src ); +void x264_predict_8x16c_v_c ( pixel *src ); +void x264_predict_8x16c_p_c ( pixel *src ); void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] ); void x264_predict_8x8c_init ( int cpu, x264_predict_t pf[7] ); +void x264_predict_8x16c_init ( int cpu, x264_predict_t pf[7] ); void x264_predict_4x4_init ( int cpu, x264_predict_t pf[12] ); void x264_predict_8x8_init ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); diff --git a/common/quant.c b/common/quant.c index 5be7f57f..db9d57a8 100644 --- a/common/quant.c +++ b/common/quant.c @@ -6,6 +6,7 @@ * Authors: Loren Merritt * Fiona Glaser * Christian Heine + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -141,54 +142,121 @@ static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp ) } } -static ALWAYS_INLINE void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf ) +#define IDCT_DEQUANT_2X4_START \ + int a0 = dct[0] + dct[1]; \ + int a1 = dct[2] + dct[3]; \ + int a2 = dct[4] + dct[5]; \ + int a3 = dct[6] + dct[7]; \ + int a4 = dct[0] - dct[1]; \ + int a5 = dct[2] - dct[3]; \ + int a6 = dct[4] - dct[5]; \ + int a7 = dct[6] - dct[7]; \ + int b0 = a0 + a1; \ + int b1 = a2 + a3; \ + int b2 = a4 + a5; \ + int b3 = a6 + a7; \ + int b4 = a0 - a1; \ + int b5 = a2 - a3; \ + int b6 = a4 - a5; \ + int b7 = a6 - a7; + +static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ) +{ + IDCT_DEQUANT_2X4_START + int dmf = dequant_mf[i_qp%6][0] << i_qp/6; + dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6; + dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6; + dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6; + dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6; + dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6; + dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6; + dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6; + dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6; +} + +static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp ) +{ + IDCT_DEQUANT_2X4_START + int dmf = dequant_mf[i_qp%6][0] << i_qp/6; + dct[0] = ((b0 + b1) * dmf + 32) >> 6; + dct[1] = ((b2 + b3) * dmf + 32) >> 6; + dct[2] = ((b0 - b1) * dmf + 32) >> 6; + dct[3] = ((b2 - b3) * dmf + 32) >> 6; + dct[4] = ((b4 - b5) * dmf + 32) >> 6; + dct[5] = ((b6 - b7) * dmf + 32) >> 6; + dct[6] = ((b4 + b5) * dmf + 32) >> 6; + dct[7] = ((b6 + b7) * dmf + 32) >> 6; +} + +static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf ) +{ + IDCT_DEQUANT_2X4_START + out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */ + out[1] = ((b2 + b3) * dmf + 2080) >> 6; + out[2] = ((b0 - b1) * dmf + 2080) >> 6; + out[3] = ((b2 - b3) * dmf + 2080) >> 6; + out[4] = ((b4 - b5) * dmf + 2080) >> 6; + out[5] = ((b6 - b7) * dmf + 2080) >> 6; + out[6] = ((b4 + b5) * dmf + 2080) >> 6; + out[7] = ((b6 + b7) * dmf + 2080) >> 6; +} +#undef IDCT_DEQUANT_2X4_START + +static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf ) { int d0 = dct[0] + dct[1]; int d1 = dct[2] + dct[3]; int d2 = dct[0] - dct[1]; int d3 = dct[2] - dct[3]; - out[0] = (d0 + d1) * dequant_mf >> 5; - out[1] = (d0 - d1) * dequant_mf >> 5; - out[2] = (d2 + 
d3) * dequant_mf >> 5; - out[3] = (d2 - d3) * dequant_mf >> 5; + out[0] = ((d0 + d1) * dmf >> 5) + 32; + out[1] = ((d0 - d1) * dmf >> 5) + 32; + out[2] = ((d2 + d3) * dmf >> 5) + 32; + out[3] = ((d2 - d3) * dmf >> 5) + 32; } -static ALWAYS_INLINE int idct_dequant_round_2x2_dc( dctcoef ref[4], dctcoef dct[4], int dequant_mf ) +static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 ) { - dctcoef out[4]; - idct_dequant_2x2_dconly( out, dct, dequant_mf ); - return ((ref[0] ^ (out[0]+32)) - | (ref[1] ^ (out[1]+32)) - | (ref[2] ^ (out[2]+32)) - | (ref[3] ^ (out[3]+32))) >> 6; + dctcoef out[8]; + + if( chroma422 ) + optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf ); + else + optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf ); + + int sum = 0; + for( int i = 0; i < (chroma422?8:4); i++ ) + sum |= ref[i] ^ out[i]; + return sum >> 6; } -static int optimize_chroma_dc( dctcoef dct[4], int dequant_mf ) +static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 ) { /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */ - dctcoef dct_orig[4]; + dctcoef dct_orig[8]; int coeff, nz; - idct_dequant_2x2_dconly( dct_orig, dct, dequant_mf ); - dct_orig[0] += 32; - dct_orig[1] += 32; - dct_orig[2] += 32; - dct_orig[3] += 32; + if( chroma422 ) + optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf ); + else + optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf ); /* If the DC coefficients already round to zero, terminate early. */ - if( !((dct_orig[0]|dct_orig[1]|dct_orig[2]|dct_orig[3])>>6) ) + int sum = 0; + for( int i = 0; i < (chroma422?8:4); i++ ) + sum |= dct_orig[i]; + if( !(sum >> 6) ) return 0; /* Start with the highest frequency coefficient... is this the best option? */ - for( nz = 0, coeff = 3; coeff >= 0; coeff-- ) + for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- ) { int level = dct[coeff]; - int sign = level>>31 | 1; /* dct2x2[coeff] < 0 ? -1 : 1 */ + int sign = level>>31 | 1; /* dct[coeff] < 0 ? 
-1 : 1 */ while( level ) { dct[coeff] = level - sign; - if( idct_dequant_round_2x2_dc( dct_orig, dct, dequant_mf ) ) + if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) ) { nz = 1; dct[coeff] = level; @@ -201,6 +269,16 @@ static int optimize_chroma_dc( dctcoef dct[4], int dequant_mf ) return nz; } +static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf ) +{ + return optimize_chroma_dc_internal( dct, dequant_mf, 0 ); +} + +static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf ) +{ + return optimize_chroma_dc_internal( dct, dequant_mf, 1 ); +} + static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ) { for( int i = 0; i < size; i++ ) @@ -275,30 +353,20 @@ static int x264_decimate_score64( dctcoef *dct ) return x264_decimate_score_internal( dct, 64 ); } -static int ALWAYS_INLINE x264_coeff_last_internal( dctcoef *l, int i_count ) -{ - int i_last = i_count-1; - while( i_last >= 0 && l[i_last] == 0 ) - i_last--; - return i_last; +#define last(num)\ +static int x264_coeff_last##num( dctcoef *l )\ +{\ + int i_last = num-1;\ + while( i_last >= 0 && l[i_last] == 0 )\ + i_last--;\ + return i_last;\ } -static int x264_coeff_last4( dctcoef *l ) -{ - return x264_coeff_last_internal( l, 4 ); -} -static int x264_coeff_last15( dctcoef *l ) -{ - return x264_coeff_last_internal( l, 15 ); -} -static int x264_coeff_last16( dctcoef *l ) -{ - return x264_coeff_last_internal( l, 16 ); -} -static int x264_coeff_last64( dctcoef *l ) -{ - return x264_coeff_last_internal( l, 64 ); -} +last(4) +last(8) +last(15) +last(16) +last(64) #define level_run(num)\ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\ @@ -317,10 +385,10 @@ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel ) } level_run(4) +level_run(8) level_run(15) level_run(16) - void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) { pf->quant_8x8 = quant_8x8; @@ -332,18 +400,24 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_4x4_dc = dequant_4x4_dc; pf->dequant_8x8 = dequant_8x8; - pf->optimize_chroma_dc = optimize_chroma_dc; + pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc; + pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly; + + pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc; + pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc; pf->denoise_dct = x264_denoise_dct; pf->decimate_score15 = x264_decimate_score15; pf->decimate_score16 = x264_decimate_score16; pf->decimate_score64 = x264_decimate_score64; - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4; + pf->coeff_last4 = x264_coeff_last4; + pf->coeff_last8 = x264_coeff_last8; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16; pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64; - pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4; + pf->coeff_level_run4 = x264_coeff_level_run4; + pf->coeff_level_run8 = x264_coeff_level_run8; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16; @@ -361,16 +435,16 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz; } pf->decimate_score64 = x264_decimate_score64_mmx2; - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2; + pf->coeff_last4 = x264_coeff_last4_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; 
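/* Illustration only, not part of the patch: the dedicated coeff_last4 and
 * coeff_last8 pointers replace the old coeff_last[DCT_CHROMA_DC] entry because
 * the chroma DC block has 4 coefficients in 4:2:0 but 8 in 4:2:2 (the 2x4 DC
 * transform above). A hypothetical call site, assuming the usual h->quantf
 * function table, would select on the chroma format:
 *
 *     int last = CHROMA_FORMAT == CHROMA_422 ? h->quantf.coeff_last8( dct_dc )
 *                                            : h->quantf.coeff_last4( dct_dc );
 */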
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2; #endif - pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2; + pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; if( cpu&X264_CPU_LZCNT ) - pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt; + pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt; } if( cpu&X264_CPU_SSE2 ) { @@ -397,7 +471,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; if( cpu&X264_CPU_LZCNT ) { - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt; + pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt; @@ -471,12 +545,12 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2; #endif - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2; - pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2; + pf->coeff_last4 = x264_coeff_last4_mmx2; + pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; if( cpu&X264_CPU_LZCNT ) { - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt; - pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt; + pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt; + pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt; } } @@ -493,7 +567,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2; pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2; } - pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse2; + pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2; pf->denoise_dct = x264_denoise_dct_sse2; pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; @@ -524,7 +598,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->quant_4x4 = x264_quant_4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; - pf->optimize_chroma_dc = x264_optimize_chroma_dc_ssse3; + pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; @@ -541,7 +615,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4_dc = x264_quant_4x4_dc_sse4; pf->quant_4x4 = x264_quant_4x4_sse4; pf->quant_8x8 = x264_quant_8x8_sse4; - pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse4; + pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4; } if( cpu&X264_CPU_AVX ) @@ -552,7 +626,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_4x4 = x264_dequant_4x4_avx; pf->dequant_8x8 = x264_dequant_8x8_avx; } - pf->optimize_chroma_dc = x264_optimize_chroma_dc_avx; + pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx; pf->denoise_dct = x264_denoise_dct_avx; } #endif // HAVE_MMX @@ -571,7 +645,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) #if HAVE_ARMV6 if( cpu&X264_CPU_ARMV6 ) - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_arm; + pf->coeff_last4 = 
x264_coeff_last4_arm; if( cpu&X264_CPU_NEON ) { diff --git a/common/quant.h b/common/quant.h index 09364143..9ad5385a 100644 --- a/common/quant.h +++ b/common/quant.h @@ -38,7 +38,11 @@ typedef struct void (*dequant_4x4)( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void (*dequant_4x4_dc)( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); - int (*optimize_chroma_dc)( dctcoef dct[4], int dequant_mf ); + void (*idct_dequant_2x4_dc)( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ); + void (*idct_dequant_2x4_dconly)( dctcoef dct[8], int dequant_mf[6][16], int i_qp ); + + int (*optimize_chroma_2x2_dc)( dctcoef dct[4], int dequant_mf ); + int (*optimize_chroma_2x4_dc)( dctcoef dct[8], int dequant_mf ); void (*denoise_dct)( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); @@ -46,7 +50,11 @@ typedef struct int (*decimate_score16)( dctcoef *dct ); int (*decimate_score64)( dctcoef *dct ); int (*coeff_last[14])( dctcoef *dct ); + int (*coeff_last4)( dctcoef *dct ); + int (*coeff_last8)( dctcoef *dct ); int (*coeff_level_run[13])( dctcoef *dct, x264_run_level_t *runlevel ); + int (*coeff_level_run4)( dctcoef *dct, x264_run_level_t *runlevel ); + int (*coeff_level_run8)( dctcoef *dct, x264_run_level_t *runlevel ); } x264_quant_function_t; void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ); diff --git a/common/set.h b/common/set.h index 4bbfea6e..038dbd4c 100644 --- a/common/set.h +++ b/common/set.h @@ -35,10 +35,17 @@ enum profile_e PROFILE_HIGH = 100, PROFILE_HIGH10 = 110, PROFILE_HIGH422 = 122, - PROFILE_HIGH444 = 144, PROFILE_HIGH444_PREDICTIVE = 244, }; +enum chroma_format_e +{ + CHROMA_400 = 0, + CHROMA_420 = 1, + CHROMA_422 = 2, + CHROMA_444 = 3, +}; + enum cqm4_e { CQM_4IY = 0, diff --git a/common/vlc.c b/common/vlc.c index 1d002bbc..bd2fc52c 100644 --- a/common/vlc.c +++ b/common/vlc.c @@ -5,6 +5,7 @@ * * Authors: Laurent Aimar * Fiona Glaser + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,16 +27,19 @@ #include "common.h" -const vlc_t x264_coeff0_token[5] = +/* [nC] */ +const vlc_t x264_coeff0_token[6] = { { 0x1, 1 }, /* str=1 */ { 0x3, 2 }, /* str=11 */ { 0xf, 4 }, /* str=1111 */ { 0x3, 6 }, /* str=000011 */ { 0x1, 2 }, /* str=01 */ + { 0x1, 1 }, /* str=1 */ }; -const vlc_t x264_coeff_token[5][16][4] = +/* [nC][i_total_coeff-1][i_trailing] */ +const vlc_t x264_coeff_token[6][16][4] = { { /* table 0 */ { /* i_total 1 */ @@ -440,6 +444,53 @@ const vlc_t x264_coeff_token[5][16][4] = { 0x0, 7 }, /* str=0000000 */ }, }, + { /* table 5 */ + { /* i_total 1 */ + { 0xf, 7 }, /* str=0001111 */ + { 0x1, 2 }, /* str=01 */ + }, + { /* i_total 2 */ + { 0xe, 7 }, /* str=0001110 */ + { 0xd, 7 }, /* str=0001101 */ + { 0x1, 3 }, /* str=001 */ + }, + { /* i_total 3 */ + { 0x7, 9 }, /* str=000000111 */ + { 0xc, 7 }, /* str=0001100 */ + { 0xb, 7 }, /* str=0001011 */ + { 0x1, 5 }, /* str=00001 */ + }, + { /* i_total 4 */ + { 0x6, 9 }, /* str=000000110 */ + { 0x5, 9 }, /* str=000000101 */ + { 0xa, 7 }, /* str=0001010 */ + { 0x1, 6 }, /* str=000001 */ + }, + { /* i_total 5 */ + { 0x7, 10 }, /* str=0000000111 */ + { 0x6, 10 }, /* str=0000000110 */ + { 0x4, 9 }, /* str=000000100 */ + { 0x9, 7 }, /* str=0001001 */ + }, + { /* i_total 6 */ + { 0x7, 11 }, /* str=00000000111 */ + { 0x6, 11 }, /* str=00000000110 */ + { 0x5, 10 }, /* str=0000000101 */ + { 0x8, 7 }, /* str=0001000 */ + }, + { /* i_total 7 */ + { 0x7, 12 }, /* str=000000000111 
*/ + { 0x6, 12 }, /* str=000000000110 */ + { 0x5, 11 }, /* str=00000000101 */ + { 0x4, 10 }, /* str=0000000100 */ + }, + { /* i_total 8 */ + { 0x7, 13 }, /* str=0000000000111 */ + { 0x5, 12 }, /* str=000000000101 */ + { 0x4, 12 }, /* str=000000000100 */ + { 0x4, 11 }, /* str=00000000100 */ + }, + }, }; /* [i_total_coeff-1][i_total_zeros] */ @@ -613,7 +664,7 @@ const vlc_t x264_total_zeros[15][16] = }; /* [i_total_coeff-1][i_total_zeros] */ -const vlc_t x264_total_zeros_dc[3][4] = +const vlc_t x264_total_zeros_2x2_dc[3][4] = { { /* i_total 1 */ { 0x1, 1 }, /* str=1 */ @@ -632,7 +683,61 @@ const vlc_t x264_total_zeros_dc[3][4] = }, }; -/* x264_run_before[__MIN( i_zero_left -1, 6 )][run_before] */ +/* [i_total_coeff-1][i_total_zeros] */ +const vlc_t x264_total_zeros_2x4_dc[7][8] = +{ + { /* i_total 1 */ + { 0x1, 1 }, /* str=1 */ + { 0x2, 3 }, /* str=010 */ + { 0x3, 3 }, /* str=011 */ + { 0x2, 4 }, /* str=0010 */ + { 0x3, 4 }, /* str=0011 */ + { 0x1, 4 }, /* str=0001 */ + { 0x1, 5 }, /* str=00001 */ + { 0x0, 5 }, /* str=00000 */ + }, + { /* i_total 2 */ + { 0x0, 3 }, /* str=000 */ + { 0x1, 2 }, /* str=01 */ + { 0x1, 3 }, /* str=001 */ + { 0x4, 3 }, /* str=100 */ + { 0x5, 3 }, /* str=101 */ + { 0x6, 3 }, /* str=110 */ + { 0x7, 3 }, /* str=111 */ + }, + { /* i_total 3 */ + { 0x0, 3 }, /* str=000 */ + { 0x1, 3 }, /* str=001 */ + { 0x1, 2 }, /* str=01 */ + { 0x2, 2 }, /* str=10 */ + { 0x6, 3 }, /* str=110 */ + { 0x7, 3 }, /* str=111 */ + }, + { /* i_total 4 */ + { 0x6, 3 }, /* str=110 */ + { 0x0, 2 }, /* str=00 */ + { 0x1, 2 }, /* str=01 */ + { 0x2, 2 }, /* str=10 */ + { 0x7, 3 }, /* str=111 */ + }, + { /* i_total 5 */ + { 0x0, 2 }, /* str=00 */ + { 0x1, 2 }, /* str=01 */ + { 0x2, 2 }, /* str=10 */ + { 0x3, 2 }, /* str=11 */ + }, + { /* i_total 6 */ + { 0x0, 2 }, /* str=00 */ + { 0x1, 2 }, /* str=01 */ + { 0x1, 1 }, /* str=1 */ + }, + { /* i_total 7 */ + { 0x0, 1 }, /* str=0 */ + { 0x1, 1 }, /* str=1 */ + } +}; + +/* [MIN( i_zero_left-1, 6 )][run_before] */ const vlc_t x264_run_before[7][16] = { { /* i_zero_left 1 */ @@ -674,7 +779,7 @@ const vlc_t x264_run_before[7][16] = { 0x5, 3 }, /* str=101 */ { 0x4, 3 }, /* str=100 */ }, - { /* i_zero_left 7 */ + { /* i_zero_left >6 */ { 0x7, 3 }, /* str=111 */ { 0x6, 3 }, /* str=110 */ { 0x5, 3 }, /* str=101 */ diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 24a5c3fa..f5c0d797 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -1015,10 +1015,9 @@ cglobal plane_copy_interleave_core, 7,7 RET ;----------------------------------------------------------------------------- -; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ) +; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height ) ;----------------------------------------------------------------------------- -cglobal store_interleave_8x8x2, 4,5 - mov r4d, 4 +cglobal store_interleave_chroma, 5,5 FIX_STRIDES r1d .loop: INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a @@ -1026,7 +1025,7 @@ cglobal store_interleave_8x8x2, 4,5 add r2, FDEC_STRIDEB*2 add r3, FDEC_STRIDEB*2 lea r0, [r0+r1*2] - dec r4d + sub r4d, 2 jg .loop REP_RET %endmacro ; PLANE_INTERLEAVE @@ -1076,34 +1075,32 @@ cglobal plane_copy_deinterleave, 6,7 REP_RET ;----------------------------------------------------------------------------- -; void load_deinterleave_8x8x2_fenc( pixel *dst, pixel *src, int i_src ) +; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height ) 
;----------------------------------------------------------------------------- -cglobal load_deinterleave_8x8x2_fenc, 3,4 +cglobal load_deinterleave_chroma_fenc, 4,4 DEINTERLEAVE_START - mov r3d, 4 FIX_STRIDES r2d .loop: DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a add r0, FENC_STRIDEB*2 lea r1, [r1+r2*2] - dec r3d + sub r3d, 2 jg .loop REP_RET ;----------------------------------------------------------------------------- -; void load_deinterleave_8x8x2_fdec( pixel *dst, pixel *src, int i_src ) +; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height ) ;----------------------------------------------------------------------------- -cglobal load_deinterleave_8x8x2_fdec, 3,4 +cglobal load_deinterleave_chroma_fdec, 4,4 DEINTERLEAVE_START - mov r3d, 4 FIX_STRIDES r2d .loop: DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a add r0, FDEC_STRIDEB*2 lea r1, [r1+r2*2] - dec r3d + sub r3d, 2 jg .loop REP_RET %endmacro ; PLANE_DEINTERLEAVE diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 52e62d6e..6a730475 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -113,17 +113,17 @@ void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu, void x264_plane_copy_deinterleave_avx( uint16_t *dstu, int i_dstu, uint16_t *dstv, int i_dstv, uint16_t *src, int i_src, int w, int h ); -void x264_store_interleave_8x8x2_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ); -void x264_store_interleave_8x8x2_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ); -void x264_store_interleave_8x8x2_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ); -void x264_load_deinterleave_8x8x2_fenc_mmx( pixel *dst, pixel *src, int i_src ); -void x264_load_deinterleave_8x8x2_fenc_sse2( pixel *dst, pixel *src, int i_src ); -void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src ); -void x264_load_deinterleave_8x8x2_fenc_avx( uint16_t *dst, uint16_t *src, int i_src ); -void x264_load_deinterleave_8x8x2_fdec_mmx( pixel *dst, pixel *src, int i_src ); -void x264_load_deinterleave_8x8x2_fdec_sse2( pixel *dst, pixel *src, int i_src ); -void x264_load_deinterleave_8x8x2_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src ); -void x264_load_deinterleave_8x8x2_fdec_avx( uint16_t *dst, uint16_t *src, int i_src ); +void x264_store_interleave_chroma_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_store_interleave_chroma_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_store_interleave_chroma_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_load_deinterleave_chroma_fenc_mmx( pixel *dst, pixel *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fdec_mmx( pixel *dst, pixel *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, int i_src, int height ); void 
*x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); void x264_memzero_aligned_mmx( void * dst, int n ); @@ -497,8 +497,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( !(cpu&X264_CPU_MMX) ) return; - pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_mmx; - pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_mmx; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_mmx; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_mmx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx; @@ -519,7 +519,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->plane_copy = x264_plane_copy_mmx2; pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2; - pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmx2; + pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmx2; @@ -552,8 +552,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; - pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2; - pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; @@ -570,7 +570,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->integral_init4v = x264_integral_init4v_sse2; pf->integral_init8v = x264_integral_init8v_sse2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; - pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2; + pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; pf->offsetadd = x264_mc_offsetadd_wtab_sse2; pf->offsetsub = x264_mc_offsetsub_wtab_sse2; @@ -603,11 +603,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( !(cpu&X264_CPU_AVX) ) return; - pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_avx; - pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_avx; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx; pf->plane_copy_interleave = x264_plane_copy_interleave_avx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx; - pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_avx; + pf->store_interleave_chroma = x264_store_interleave_chroma_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_avx; @@ -663,9 +663,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( cpu&X264_CPU_SSE2_IS_FAST ) { - pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2; // FIXME sse2fast? sse2medium? - pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2; - pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2; + pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium? 
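/* Illustration only, not part of the patch: with the explicit height argument,
 * a single set of interleave/deinterleave helpers covers both NV12 (4:2:0) and
 * NV16 (4:2:2) layouts. A sketch of a caller, using the chroma_v_shift field
 * seen elsewhere in this patch (1 for 4:2:0, 0 for 4:2:2); the dst and i_dst
 * names are hypothetical:
 *
 *     int height = 16 >> h->mb.chroma_v_shift;   // 8 rows for 4:2:0, 16 for 4:2:2
 *     h->mc.store_interleave_chroma( dst, i_dst, h->mb.pic.p_fdec[1],
 *                                    h->mb.pic.p_fdec[2], height );
 */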
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; pf->mc_luma = mc_luma_sse2; @@ -695,8 +695,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; - pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_ssse3; - pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_ssse3; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; pf->hpel_filter = x264_hpel_filter_ssse3; diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 215a7170..40f9ed58 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -670,10 +670,10 @@ DEQUANT_DC w, pmullw %endif ;----------------------------------------------------------------------------- -; x264_optimize_chroma_dc( dctcoef dct[4], int dequant_mf ) +; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf ) ;----------------------------------------------------------------------------- -%macro OPTIMIZE_CHROMA_DC 0 +%macro OPTIMIZE_CHROMA_2x2_DC 0 %assign %%regs 5 %if cpuflag(sse4) %assign %%regs %%regs-1 @@ -681,7 +681,7 @@ DEQUANT_DC w, pmullw %ifndef ARCH_X86_64 %assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64 %endif -cglobal optimize_chroma_dc, 0,%%regs,7 +cglobal optimize_chroma_2x2_dc, 0,%%regs,7 movifnidn t0, r0mp movd m2, r1m movq m1, [t0] @@ -775,13 +775,13 @@ cglobal optimize_chroma_dc, 0,%%regs,7 %ifndef HIGH_BIT_DEPTH INIT_XMM sse2 -OPTIMIZE_CHROMA_DC +OPTIMIZE_CHROMA_2x2_DC INIT_XMM ssse3 -OPTIMIZE_CHROMA_DC +OPTIMIZE_CHROMA_2x2_DC INIT_XMM sse4 -OPTIMIZE_CHROMA_DC +OPTIMIZE_CHROMA_2x2_DC INIT_XMM avx -OPTIMIZE_CHROMA_DC +OPTIMIZE_CHROMA_2x2_DC %endif ; !HIGH_BIT_DEPTH %ifdef HIGH_BIT_DEPTH diff --git a/common/x86/quant.h b/common/x86/quant.h index 4abaea09..8b604720 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -57,10 +57,10 @@ void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_ void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); -int x264_optimize_chroma_dc_sse2( dctcoef dct[4], int dequant_mf ); -int x264_optimize_chroma_dc_ssse3( dctcoef dct[4], int dequant_mf ); -int x264_optimize_chroma_dc_sse4( dctcoef dct[4], int dequant_mf ); -int x264_optimize_chroma_dc_avx( dctcoef dct[4], int dequant_mf ); +int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf ); +int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf ); +int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf ); +int x264_optimize_chroma_2x2_dc_avx( dctcoef dct[4], int dequant_mf ); void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); diff --git a/encoder/analyse.c b/encoder/analyse.c index 69de5174..b5b5a78d 100644 --- 
a/encoder/analyse.c +++ b/encoder/analyse.c @@ -103,8 +103,8 @@ typedef struct int i_satd_pcm; /* Chroma part */ - int i_satd_i8x8chroma; - int i_satd_i8x8chroma_dir[7]; + int i_satd_chroma; + int i_satd_chroma_dir[7]; int i_predict8x8chroma; /* II: Inter part P/B frame */ @@ -431,7 +431,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) a->i_satd_i16x16 = a->i_satd_i8x8 = a->i_satd_i4x4 = - a->i_satd_i8x8chroma = COST_MAX; + a->i_satd_chroma = COST_MAX; /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */ a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX; @@ -607,7 +607,7 @@ static const int8_t i16x16_mode_available[5][5] = {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1}, }; -static const int8_t i8x8chroma_mode_available[5][5] = +static const int8_t chroma_mode_available[5][5] = { {I_PRED_CHROMA_DC_128, -1, -1, -1, -1}, {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1}, @@ -641,11 +641,11 @@ static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour return i16x16_mode_available[idx]; } -static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour ) +static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour ) { int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT); - return i8x8chroma_mode_available[idx]; + return chroma_mode_available[idx]; } static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i ) @@ -690,45 +690,46 @@ static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd ) static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) { - if( a->i_satd_i8x8chroma < COST_MAX ) + if( a->i_satd_chroma < COST_MAX ) return; if( CHROMA444 ) { if( !h->mb.b_chroma_me ) { - a->i_satd_i8x8chroma = 0; + a->i_satd_chroma = 0; return; } /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. 
*/ h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] ); h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] ); - a->i_satd_i8x8chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) - + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); + a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) + + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); return; } - const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra ); + const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra ); + int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; - /* 8x8 prediction selection for chroma */ + /* Prediction selection for chroma */ if( predict_mode[3] >= 0 && !h->mb.b_lossless ) { int satdu[4], satdv[4]; - h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu ); - h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv ); - h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] ); - satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ); - satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); + h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu ); + h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv ); + h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] ); + h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] ); + satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ); + satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); for( ; *predict_mode >= 0; predict_mode++ ) { int i_mode = *predict_mode; int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode ); - a->i_satd_i8x8chroma_dir[i_mode] = i_satd; - COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode ); + a->i_satd_chroma_dir[i_mode] = i_satd; + COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode ); } } else @@ -740,20 +741,20 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) /* we do the prediction */ if( h->mb.b_lossless ) - x264_predict_lossless_8x8_chroma( h, i_mode ); + x264_predict_lossless_chroma( h, i_mode ); else { - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } /* we calculate the cost */ - i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) + - h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) + - a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] ); + i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) + + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) + + a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] ); - a->i_satd_i8x8chroma_dir[i_mode] = i_satd; - COPY2_IF_LT( a->i_satd_i8x8chroma, 
i_satd, a->i_predict8x8chroma, i_mode ); + a->i_satd_chroma_dir[i_mode] = i_satd; + COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode ); } } @@ -1110,17 +1111,17 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) /* RD selection for chroma prediction */ if( !CHROMA444 ) { - const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra ); + const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra ); if( predict_mode[1] >= 0 ) { int8_t predict_mode_sorted[4]; int i_max; - int i_thresh = a->b_early_terminate ? a->i_satd_i8x8chroma * 5/4 : COST_MAX; + int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX; for( i_max = 0; *predict_mode >= 0; predict_mode++ ) { int i_mode = *predict_mode; - if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma ) + if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma ) predict_mode_sorted[i_max++] = i_mode; } @@ -1131,21 +1132,21 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) /* the previous thing encoded was x264_intra_rd(), so the pixels and * coefs for the current chroma mode are still around, so we only * have to recount the bits. */ - i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 ); + i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 ); for( int i = 0; i < i_max; i++ ) { int i_mode = predict_mode_sorted[i]; if( h->mb.b_lossless ) - x264_predict_lossless_8x8_chroma( h, i_mode ); + x264_predict_lossless_chroma( h, i_mode ); else { - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } /* if we've already found a mode that needs no residual, then * probably any mode with a residual will be worse. * so avoid dct on the remaining modes to improve speed. 
*/ - i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 ); + i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 ); COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma ); } h->mb.i_chroma_pred_mode = a->i_predict8x8chroma; @@ -1273,14 +1274,13 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) #define LOAD_FENC(m, src, xoff, yoff) \ { \ - int s = !CHROMA444; \ (m)->p_cost_mv = a->p_cost_mv; \ (m)->i_stride[0] = h->mb.pic.i_stride[0]; \ (m)->i_stride[1] = h->mb.pic.i_stride[1]; \ (m)->i_stride[2] = h->mb.pic.i_stride[2]; \ (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \ - (m)->p_fenc[1] = &(src)[1][((xoff)>>s)+((yoff)>>s)*FENC_STRIDE]; \ - (m)->p_fenc[2] = &(src)[2][((xoff)>>s)+((yoff)>>s)*FENC_STRIDE]; \ + (m)->p_fenc[1] = &(src)[1][((xoff)>>h->mb.chroma_h_shift)+((yoff)>>h->mb.chroma_v_shift)*FENC_STRIDE]; \ + (m)->p_fenc[2] = &(src)[2][((xoff)>>h->mb.chroma_h_shift)+((yoff)>>h->mb.chroma_v_shift)*FENC_STRIDE]; \ } #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \ @@ -1301,7 +1301,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \ } \ else \ - (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \ + (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>h->mb.chroma_v_shift)*(m)->i_stride[1]]; \ (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->weight = x264_weight_none; \ (m)->i_ref = ref; \ @@ -1672,19 +1672,22 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost; } -static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size ) +static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a, + pixel **p_fref, int i8x8, int size, int chroma ) { - ALIGNED_ARRAY_16( pixel, pix1,[16*8] ); + ALIGNED_ARRAY_16( pixel, pix1,[16*16] ); pixel *pix2 = pix1+8; - const int i_stride = h->mb.pic.i_stride[1]; - const int or = 8*(i8x8&1) + 2*(i8x8&2)*i_stride; - const int i_ref = a->l0.me8x8[i8x8].i_ref; - const int mvy_offset = MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + int i_stride = h->mb.pic.i_stride[1]; + int chroma_h_shift = chroma <= CHROMA_422; + int chroma_v_shift = chroma == CHROMA_420; + int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride; + int i_ref = a->l0.me8x8[i8x8].i_ref; + int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? 
(h->mb.i_mb_y & 1)*4 - 2 : 0; x264_weight_t *weight = h->sh.weight[i_ref]; // FIXME weight can be done on 4x4 blocks even if mc is smaller #define CHROMA4x4MC( width, height, me, x, y ) \ - if( CHROMA444 ) \ + if( chroma == CHROMA_444 ) \ { \ int mvx = (me).mv[0] + 4*2*x; \ int mvy = (me).mv[1] + 4*2*y; \ @@ -1695,14 +1698,16 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, } \ else \ { \ - h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x*2+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ + int offset = x + (2>>chroma_v_shift)*16*y; \ + int chroma_height = (2>>chroma_v_shift)*height; \ + h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \ + (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \ if( weight[1].weightfn ) \ - weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \ + weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \ if( weight[2].weightfn ) \ - weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height ); \ + weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \ } - if( size == PIXEL_4x4 ) { x264_me_t *m = a->l0.me4x4[i8x8]; @@ -1723,13 +1728,24 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, CHROMA4x4MC( 2,4, m[0], 0,0 ); CHROMA4x4MC( 2,4, m[1], 2,0 ); } +#undef CHROMA4x4MC - int oe = (8*(i8x8&1) + 4*(i8x8&2)*FENC_STRIDE) >> !CHROMA444; - int chromapix = CHROMA444 ? PIXEL_8x8 : PIXEL_4x4; + int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE; + int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? 
PIXEL_4x8 : PIXEL_4x4; return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 ) + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 ); } +static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size ) +{ + if( CHROMA_FORMAT == CHROMA_444 ) + return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 ); + else if( CHROMA_FORMAT == CHROMA_422 ) + return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 ); + else + return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 ); +} + static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 ) { pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref]; @@ -1845,47 +1861,46 @@ static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t * { ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] ); ALIGNED_ARRAY_16( pixel, bi, [2],[16*16] ); - int l0_mvy_offset, l1_mvy_offset; int i_chroma_cost = 0; + int chromapix = h->luma2chroma_pixel[i_pixel]; #define COST_BI_CHROMA( m0, m1, width, height ) \ { \ if( CHROMA444 ) \ { \ h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \ - m0.mv[0], m0.mv[1], 2*width, 2*height, x264_weight_none ); \ + m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \ h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \ - m0.mv[0], m0.mv[1], 2*width, 2*height, x264_weight_none ); \ + m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \ h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \ - m1.mv[0], m1.mv[1], 2*width, 2*height, x264_weight_none ); \ + m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \ h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \ - m1.mv[0], m1.mv[1], 2*width, 2*height, x264_weight_none ); \ - h->mc.avg[i_pixel]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ - h->mc.avg[i_pixel]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ - i_chroma_cost = h->pixf.mbcmp[i_pixel]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ); \ - i_chroma_cost += h->pixf.mbcmp[i_pixel]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \ + m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \ } \ else \ { \ - l0_mvy_offset = MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \ - l1_mvy_offset = MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \ - h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], m0.mv[0], m0.mv[1] + l0_mvy_offset, width, height ); \ - h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], m1.mv[0], m1.mv[1] + l1_mvy_offset, width, height ); \ - h->mc.avg[i_pixel+3]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ - h->mc.avg[i_pixel+3]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ - i_chroma_cost = h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ); \ - i_chroma_cost += h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \ + int v_shift = h->mb.chroma_v_shift; \ + int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \ + int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? 
(h->mb.i_mb_y & 1)*4 - 2 : 0; \ + h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \ + m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \ + h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \ + m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \ } \ + h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ + h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ + i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \ + + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \ } if( i_pixel == PIXEL_16x16 ) - COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 8, 8 ) + COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 ) else if( i_pixel == PIXEL_16x8 ) - COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 8, 4 ) + COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 ) else if( i_pixel == PIXEL_8x16 ) - COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 4, 8 ) + COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 ) else - COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 4, 4 ) + COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 ) return i_chroma_cost; } @@ -1897,12 +1912,12 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a ) pixel *p_fenc = h->mb.pic.p_fenc[0]; pixel *p_fdec = h->mb.pic.p_fdec[0]; - int s = !CHROMA444; a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT]; if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 ) { - int chromapix = CHROMA444 ? PIXEL_8x8 : PIXEL_4x4; + int chromapix = h->luma2chroma_pixel[PIXEL_8x8]; + for( int i = 0; i < 4; i++ ) { const int x = (i&1)*8; @@ -1911,10 +1926,12 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a ) &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE ); if( h->mb.b_chroma_me ) { - a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][(x>>s)+(y>>s)*FENC_STRIDE], FENC_STRIDE, - &h->mb.pic.p_fdec[1][(x>>s)+(y>>s)*FDEC_STRIDE], FDEC_STRIDE ) - + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][(x>>s)+(y>>s)*FENC_STRIDE], FENC_STRIDE, - &h->mb.pic.p_fdec[2][(x>>s)+(y>>s)*FDEC_STRIDE], FDEC_STRIDE ); + int fenc_offset = (x>>h->mb.chroma_h_shift) + (y>>h->mb.chroma_v_shift)*FENC_STRIDE; + int fdec_offset = (x>>h->mb.chroma_h_shift) + (y>>h->mb.chroma_v_shift)*FDEC_STRIDE; + a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE, + &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE ) + + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE, + &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE ); } a->i_cost16x16direct += a->i_cost8x8direct[i]; @@ -1924,10 +1941,10 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a ) } else { - int chromapix = CHROMA444 ? 
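
[Note: the `2*(mv+offset)>>v_shift` passed to mc_chroma above assumes mc_chroma keeps its 4:2:0-era convention for the vertical mv against a half-height plane; with that assumption the expression is a no-op for 4:2:0 (v_shift=1) and doubles the vertical mv for 4:2:2 (v_shift=0), whose chroma planes have full vertical resolution. A minimal model of just that adjustment:]

    #include <assert.h>

    static int chroma_mvy( int mvy, int mvy_offset, int v_shift )
    {
        return 2*(mvy + mvy_offset) >> v_shift;
    }

    int main(void)
    {
        assert( chroma_mvy( 6, 0, 1 ) == 6 );  /* 4:2:0: mv unchanged */
        assert( chroma_mvy( 6, 0, 0 ) == 12 ); /* 4:2:2: doubled for the full-height plane */
        return 0;
    }
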
PIXEL_16x16 : PIXEL_8x8; a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE ); if( h->mb.b_chroma_me ) { + int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ); } @@ -2055,7 +2072,6 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) if( h->mb.b_chroma_me ) { - ALIGNED_ARRAY_16( pixel, pixuv, [2],[8*FENC_STRIDE] ); ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] ); if( CHROMA444 ) @@ -2071,31 +2087,37 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) } else { - if( MB_INTERLACED & a->l0.bi16x16.i_ref ) + ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] ); + int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; + int v_shift = h->mb.chroma_v_shift; + + if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref ) { - int l0_mvy_offset = MB_INTERLACED & a->l0.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2; h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 ); } else - h->mc.load_deinterleave_8x8x2_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1] ); + h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], + h->mb.pic.i_stride[1], 16>>v_shift ); - if( MB_INTERLACED & a->l1.bi16x16.i_ref ) + if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref ) { - int l1_mvy_offset = MB_INTERLACED & a->l1.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2; h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 ); } else - h->mc.load_deinterleave_8x8x2_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1] ); + h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], + h->mb.pic.i_stride[1], 16>>v_shift ); - h->mc.avg[PIXEL_8x8]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE, + h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] ); - h->mc.avg[PIXEL_8x8]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE, + h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] ); - cost00 += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE ) - + h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE ); + cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE ) + + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE ); } } @@ -3172,11 +3194,11 @@ intra_analysis: else { x264_mb_analyse_intra_chroma( h, &analysis ); - x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma ); + x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma ); } - analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma; - analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma; - analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma; + analysis.i_satd_i16x16 += analysis.i_satd_chroma; + analysis.i_satd_i8x8 += 
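
[Note: load_deinterleave_chroma_fenc replaces the fixed 8x8x2 variant and takes the row count explicitly (16>>v_shift). The sketch below is a guess at its shape, not the actual implementation: an interleaved NV12/NV16 reference row is split into U and V with V stored 8 pixels after U, which is what the `pixuv[i]` / `pixuv[i]+8` usage above implies.]

    #include <stdint.h>

    static void deinterleave_chroma( uint8_t *dst, intptr_t i_dst,
                                     const uint8_t *src, intptr_t i_src, int height )
    {
        for( int y = 0; y < height; y++ )
            for( int x = 0; x < 8; x++ )
            {
                dst[y*i_dst + x]     = src[y*i_src + 2*x];     /* U plane */
                dst[y*i_dst + x + 8] = src[y*i_src + 2*x + 1]; /* V plane */
            }
    }

    int main(void)
    {
        uint8_t src[16] = { 1,2, 3,4, 5,6, 7,8, 9,10, 11,12, 13,14, 15,16 };
        uint8_t dst[16];
        deinterleave_chroma( dst, 16, src, 16, 1 );
        return ( dst[0] == 1 && dst[8] == 2 ) ? 0 : 1;
    }
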
analysis.i_satd_chroma; + analysis.i_satd_i4x4 += analysis.i_satd_chroma; } else x264_mb_analyse_intra( h, &analysis, i_cost ); @@ -3219,8 +3241,9 @@ intra_analysis: h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 ); if( !CHROMA444 ) { - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 ); - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 ); + int height = 16 >> h->mb.chroma_v_shift; + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height ); + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height ); } x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) ); goto intra_analysis; @@ -3583,11 +3606,11 @@ intra_analysis: else { x264_mb_analyse_intra_chroma( h, &analysis ); - x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_i8x8chroma ); + x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma ); } - analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma; - analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma; - analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma; + analysis.i_satd_i16x16 += analysis.i_satd_chroma; + analysis.i_satd_i8x8 += analysis.i_satd_chroma; + analysis.i_satd_i4x4 += analysis.i_satd_chroma; } else x264_mb_analyse_intra( h, &analysis, i_satd_inter ); diff --git a/encoder/cabac.c b/encoder/cabac.c index 491b4ee7..c575724e 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -210,8 +210,8 @@ static void x264_cabac_mb_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb ) { - const int i_mode = x264_mb_pred_mode8x8c_fix[h->mb.i_chroma_pred_mode]; - int ctx = 0; + int i_mode = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]; + int ctx = 0; /* No need to test for I4x4 or I_16x16 as cache_save handle that */ if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy[0]] != 0 ) @@ -485,7 +485,7 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i ) * 1-> AC 16x16 i_idx = luma4x4idx * 2-> Luma4x4 i_idx = luma4x4idx * 3-> DC Chroma i_idx = iCbCr - * 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx + * 4-> AC Chroma i_idx = numChroma4x4Blks * iCbCr + chroma4x4idx * 5-> Luma8x8 i_idx = luma8x8idx */ @@ -567,6 +567,7 @@ static const uint8_t last_coeff_flag_offset_8x8[63] = 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; +static const uint8_t coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */ // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). // 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). @@ -574,6 +575,9 @@ static const uint8_t last_coeff_flag_offset_8x8[63] = static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; /* map node ctx => cabac ctx for level>1 */ static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; +/* 4:2:2 chroma dc uses a slightly different state machine for some reason, also note that + * 4:2:0 chroma dc doesn't use the last state so it has identical output with both arrays. 
*/ +static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 }; static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after coding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, @@ -583,18 +587,17 @@ static const uint8_t coeff_abs_level_transition[2][8] = { static const uint8_t count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; #if !RDO_SKIP_BS -static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +static ALWAYS_INLINE void block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc ) { - const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED]; int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; - int coeff_idx = -1, node_ctx = 0, last; - int coeffs[64]; - - last = h->quantf.coeff_last[ctx_block_cat]( l ); + int coeff_idx = -1, node_ctx = 0; + int last = h->quantf.coeff_last[ctx_block_cat]( l ); + const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; + dctcoef coeffs[64]; -#define WRITE_SIGMAP( l8x8 )\ +#define WRITE_SIGMAP( sig_off, last_off )\ {\ int i = 0;\ while( 1 )\ @@ -602,19 +605,18 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo if( l[i] )\ {\ coeffs[++coeff_idx] = l[i];\ - x264_cabac_encode_decision( cb, ctx_sig + (l8x8 ? sig_offset[i] : i), 1 );\ + x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );\ if( i == last )\ {\ - x264_cabac_encode_decision( cb, ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 1 );\ + x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );\ break;\ }\ else\ - x264_cabac_encode_decision( cb, ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 );\ + x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );\ }\ else\ - x264_cabac_encode_decision( cb, ctx_sig + (l8x8 ? 
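
[Note: the 4:2:2 DC significance/last contexts pair coefficients up rather than giving each its own context, exactly as the MIN( i/2, 2 ) comment on coeff_flag_offset_chroma_422_dc says. A quick self-check of that table:]

    #include <assert.h>

    int main(void)
    {
        static const unsigned char coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 };
        for( int i = 0; i < 7; i++ )
            assert( coeff_flag_offset_chroma_422_dc[i] == ( i/2 < 2 ? i/2 : 2 ) );
        return 0;
    }
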
sig_offset[i] : i), 0 );\ - i++;\ - if( i == count_m1 )\ + x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );\ + if( ++i == count_m1 )\ {\ coeffs[++coeff_idx] = l[i];\ break;\ @@ -622,11 +624,22 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo }\ } - int count_m1 = count_cat_m1[ctx_block_cat]; - if( count_m1 == 63 ) - WRITE_SIGMAP( 1 ) + if( chroma422dc ) + { + int count_m1 = 7; + WRITE_SIGMAP( coeff_flag_offset_chroma_422_dc[i], coeff_flag_offset_chroma_422_dc[i] ) + } else - WRITE_SIGMAP( 0 ) + { + int count_m1 = count_cat_m1[ctx_block_cat]; + if( count_m1 == 63 ) + { + const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED]; + WRITE_SIGMAP( sig_offset[i], last_coeff_flag_offset_8x8[i] ) + } + else + WRITE_SIGMAP( i, i ) + } do { @@ -639,7 +652,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo if( abs_coeff > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); - ctx = coeff_abs_levelgt1_ctx[node_ctx] + ctx_level; + ctx = levelgt1_ctx[node_ctx] + ctx_level; for( int i = X264_MIN( abs_coeff, 15 ) - 2; i > 0; i-- ) x264_cabac_encode_decision( cb, ctx, 1 ); if( abs_coeff < 15 ) @@ -658,15 +671,23 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo x264_cabac_encode_bypass( cb, coeff_sign ); } while( --coeff_idx >= 0 ); } +static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0 ); +} +static void block_residual_write_cabac_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + /* Template a version specifically for chroma 4:2:2 DC in order to avoid + * slowing down everything else due to the added complexity. */ + block_residual_write_cabac_internal( h, cb, DCT_CHROMA_DC, l, 1 ); +} #define block_residual_write_cabac_8x8( h, cb, cat, l ) block_residual_write_cabac( h, cb, cat, l ) - #else -/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct - * this is slightly incorrect because the sigmap is not reversible - * (contexts are repeated). However, there is nearly no quality penalty - * for this (~0.001db) and the speed boost (~30%) is worth it. */ -static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8 ) +/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct and chroma 4:2:2 dc this is + * slightly incorrect because the sigmap is not reversible (contexts are repeated). However, there + * is nearly no quality penalty for this (~0.001db) and the speed boost (~30%) is worth it. */ +static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc ) { const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED]; int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; @@ -676,17 +697,20 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c int coeff_abs = abs(l[last]); int ctx = coeff_abs_level1_ctx[0] + ctx_level; int node_ctx; + const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; - if( last != (b_8x8 ? 63 : count_cat_m1[ctx_block_cat]) ) + if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) ) { - x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? 
sig_offset[last] : last), 1 ); - x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] : last), 1 ); + x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : + chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 ); + x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] : + chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 ); } if( coeff_abs > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); - ctx = coeff_abs_levelgt1_ctx[0] + ctx_level; + ctx = levelgt1_ctx[0] + ctx_level; if( coeff_abs < 15 ) { cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]]; @@ -712,14 +736,16 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c if( l[i] ) { coeff_abs = abs(l[i]); - x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : i), 1 ); - x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 ); + x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : + chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 1 ); + x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] : + chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 ); ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level; if( coeff_abs > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); - ctx = coeff_abs_levelgt1_ctx[node_ctx] + ctx_level; + ctx = levelgt1_ctx[node_ctx] + ctx_level; if( coeff_abs < 15 ) { cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]]; @@ -741,45 +767,49 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c } } else - x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : i), 0 ); + x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : + chroma422dc ? 
coeff_flag_offset_chroma_422_dc[i] : i), 0 ); } } static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 1 ); + block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 1, 0 ); +} +static void block_residual_write_cabac_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + block_residual_write_cabac_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); } static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0 ); + block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0, 0 ); } #endif -#define block_residual_write_cabac_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ +#define block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, name )\ do\ {\ int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra );\ if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\ {\ x264_cabac_encode_decision( cb, ctxidxinc, 1 );\ - block_residual_write_cabac( h, cb, ctx_block_cat, l );\ + block_residual_write_cabac##name( h, cb, ctx_block_cat, l );\ }\ else\ x264_cabac_encode_decision( cb, ctxidxinc, 0 );\ } while(0) +#define block_residual_write_cabac_dc_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ + block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, ) + +#define block_residual_write_cabac_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ + block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, ) + #define block_residual_write_cabac_8x8_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ -do\ -{\ - int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra );\ - if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\ - {\ - x264_cabac_encode_decision( cb, ctxidxinc, 1 );\ - block_residual_write_cabac_8x8( h, cb, ctx_block_cat, l );\ - }\ - else\ - x264_cabac_encode_decision( cb, ctxidxinc, 0 );\ -} while(0) + block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, _8x8 ) + +#define block_residual_write_cabac_422_dc_cbf( h, cb, ch, b_intra )\ + block_residual_write_cabac_cbf_internal( h, cb, DCT_CHROMA_DC, CHROMA_DC+(ch), h->dct.chroma_dc[ch], b_intra, _422_dc ) static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int plane_count, int chroma ) { @@ -808,7 +838,7 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_ bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] ); if( chroma ) for( int ch = 1; ch < 3; ch++ ) - for( int i = 0; i < 8; i++ ) + for( int i = 0; i < 16>>h->mb.chroma_v_shift; i++ ) for( int j = 0; j < 8; j++ ) bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] ); @@ -968,7 +998,7 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_ x264_cabac_mb_transform_size( h, cb ); } - if( h->mb.i_cbp_luma > 0 || (chroma && h->mb.i_cbp_chroma > 0) || i_mb_type == I_16x16 ) + if( h->mb.i_cbp_luma || (chroma && h->mb.i_cbp_chroma) || i_mb_type == I_16x16 ) { const int b_intra = IS_INTRA( i_mb_type ); x264_cabac_mb_qp_delta( h, cb ); @@ -979,7 +1009,7 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_ /* DC Luma */ for( int p = 0; p < plane_count; p++ ) { - block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 ); + 
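
[Note: the chroma422dc flag follows the same pattern as b_8x8: one ALWAYS_INLINE "internal" body takes the flag as a compile-time constant and each thin wrapper pins it, so the compiler specializes away the untaken branches instead of testing them per coefficient. The idiom in miniature, with illustrative names that are not from x264:]

    static inline int residual_bits_internal( const int *l, int n, int chroma422dc )
    {
        int bits = 0;
        for( int i = 0; i < n; i++ )
            bits += chroma422dc ? 2*(l[i] != 0)  /* stand-in for the 4:2:2 DC contexts */
                                : (l[i] != 0);   /* stand-in for the common path */
        return bits;
    }
    static int residual_bits( const int *l, int n ) { return residual_bits_internal( l, n, 0 ); }
    static int residual_bits_422_dc( const int *l ) { return residual_bits_internal( l, 8, 1 ); }

    int main(void)
    {
        int dc[8] = { 3, 0, -1, 0, 0, 0, 0, 0 };
        return ( residual_bits( dc, 8 ) + residual_bits_422_dc( dc ) == 6 ) ? 0 : 1;
    }
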
block_residual_write_cabac_dc_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 ); /* AC Luma */ if( h->mb.i_cbp_luma ) @@ -1054,12 +1084,24 @@ if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */ { - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra ); - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra ); - if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */ - for( int ch = 1; ch < 3; ch++ ) - for( int i = ch*16; i < ch*16+4; i++ ) - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + block_residual_write_cabac_422_dc_cbf( h, cb, 0, b_intra ); + block_residual_write_cabac_422_dc_cbf( h, cb, 1, b_intra ); + } + else + { + block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra ); + block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra ); + } + + if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */ + { + int step = 8 << h->mb.chroma_v_shift; + for( int i = 16; i < 3*16; i += step ) + for( int j = i; j < i+4; j++ ) + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, b_intra ); + } } } @@ -1130,8 +1172,19 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int if( h->mb.i_cbp_chroma ) { - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 ); - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + int offset = (5*i8) & 0x09; + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1, 0 ); + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1, 0 ); + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1, 0 ); + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1, 0 ); + } + else + { + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 ); + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 ); + } } i8 += x264_pixel_size[i_pixel].h >> 3; @@ -1180,19 +1233,30 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+p*16, h->dct.luma4x4[i4+p*16], 1 ); } -static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb ) +static void x264_chroma_size_cabac( x264_t *h, x264_cabac_t *cb ) { x264_cabac_mb_intra_chroma_pred_mode( h, cb ); x264_cabac_mb_cbp_chroma( h, cb ); - if( h->mb.i_cbp_chroma > 0 ) + if( h->mb.i_cbp_chroma ) { - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 ); - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + block_residual_write_cabac_422_dc_cbf( h, cb, 0, 1 ); + block_residual_write_cabac_422_dc_cbf( h, cb, 1, 1 ); + } + else + { + block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 ); + block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 ); + } if( h->mb.i_cbp_chroma == 2 ) 
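
[Note: the chroma AC loops and the (5*i8)&0x09 trick above both fall out of the block numbering: U AC blocks start at 16, V at 32, and 4:2:2 adds a second group of four at +8 (24..27 and 40..43), so `step = 8 << v_shift` walks one or both groups; (5*i8)&0x09 maps a luma 8x8 index to the top 4x4 of its 4x8 chroma column, with the lower half at offset+2. A check of both:]

    #include <assert.h>

    int main(void)
    {
        /* step = 8 << v_shift: 4:2:0 (v_shift=1) visits 16..19 and 32..35,
         * 4:2:2 (v_shift=0) additionally visits 24..27 and 40..43 */
        int count[2] = { 0, 0 };
        for( int v_shift = 0; v_shift <= 1; v_shift++ )
            for( int i = 16; i < 3*16; i += 8 << v_shift )
                for( int j = i; j < i+4; j++ )
                    count[v_shift]++;
        assert( count[0] == 16 && count[1] == 8 );

        /* (5*i8) & 0x09 maps luma 8x8 index {0,1,2,3} -> {0,1,8,9} */
        static const int expect[4] = { 0, 1, 8, 9 };
        for( int i8 = 0; i8 < 4; i8++ )
            assert( ((5*i8) & 0x09) == expect[i8] );
        return 0;
    }
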
- for( int ch = 1; ch < 3; ch++ ) - for( int i = ch*16; i < ch*16+4; i++ ) - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 1 ); + { + int step = 8 << h->mb.chroma_v_shift; + for( int i = 16; i < 3*16; i += step ) + for( int j = i; j < i+4; j++ ) + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, 1 ); + } } } #endif diff --git a/encoder/cavlc.c b/encoder/cavlc.c index dcf4e9b4..07397e0a 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -122,10 +122,9 @@ static int block_residual_write_cavlc_internal( x264_t *h, int ctx_block_cat, dc { bs_t *s = &h->out.bs; static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0}; - static const uint8_t count_cat[14] = {16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64}; + static const uint8_t count_cat[14] = {16, 15, 16, 0, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64}; x264_run_level_t runlevel; - int i_trailing, i_total_zero, i_suffix_length; - int i_total = 0; + int i_total, i_trailing, i_total_zero, i_suffix_length; unsigned int i_sign; /* level and run and total */ @@ -177,13 +176,17 @@ static int block_residual_write_cavlc_internal( x264_t *h, int ctx_block_cat, dc } } - if( (uint8_t)i_total < count_cat[ctx_block_cat] ) + if( ctx_block_cat == DCT_CHROMA_DC ) { - if( ctx_block_cat == DCT_CHROMA_DC ) - bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] ); - else - bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] ); + if( i_total < 8>>h->mb.chroma_v_shift ) + { + vlc_t total_zeros = CHROMA_FORMAT == CHROMA_420 ? x264_total_zeros_2x2_dc[i_total-1][i_total_zero] + : x264_total_zeros_2x4_dc[i_total-1][i_total_zero]; + bs_write_vlc( s, total_zeros ); + } } + else if( (uint8_t)i_total < count_cat[ctx_block_cat] ) + bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] ); for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ ) { @@ -199,7 +202,8 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3}; #define block_residual_write_cavlc(h,cat,idx,l)\ {\ - int nC = cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\ + int nC = cat == DCT_CHROMA_DC ? 3 + CHROMA_FORMAT\ + : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\ uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\ if( !*nnz )\ bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\ @@ -323,7 +327,7 @@ void x264_macroblock_write_cavlc( x264_t *h ) bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] ); if( chroma ) for( int ch = 1; ch < 3; ch++ ) - for( int i = 0; i < 8; i++ ) + for( int i = 0; i < 16>>h->mb.chroma_v_shift; i++ ) for( int j = 0; j < 8; j++ ) bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] ); @@ -358,14 +362,14 @@ void x264_macroblock_write_cavlc( x264_t *h ) bs_write( s, 4, i_mode - (i_mode > i_pred) ); } if( chroma ) - bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] ); + bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] ); } else if( i_mb_type == I_16x16 ) { bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] + h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 
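
[Note: under the CHROMA_* values assumed earlier, `3 + CHROMA_FORMAT` picks coeff_token table 4 for 4:2:0 (the value previously hardcoded) and the new table 5 for 4:2:2, matching the x264_coeff_token[6] extension; likewise `8 >> chroma_v_shift` is the DC coefficient count (4 for the 2x2 transform, 8 for 2x4), which selects between the two total_zeros DC tables. A compile-time restatement:]

    #include <assert.h>

    enum { CHROMA_420 = 1, CHROMA_422 = 2 }; /* assumed values, as above */

    int main(void)
    {
        assert( 3 + CHROMA_420 == 4 );            /* the old hardcoded chroma-DC nC */
        assert( 3 + CHROMA_422 == 5 );            /* the new 2x4 table */
        assert( (8 >> 1) == 4 && (8 >> 0) == 8 ); /* DC coeff counts per format */
        return 0;
    }
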
0 : 12 ) ); if( chroma ) - bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] ); + bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] ); } else if( i_mb_type == P_L0 ) { @@ -539,10 +543,13 @@ void x264_macroblock_write_cavlc( x264_t *h ) /* Chroma DC residual present */ block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] ); block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] ); - if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */ - for( int ch = 1; ch < 3; ch++ ) - for( int i = ch*16; i < ch*16+4; i++ ) - block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 ); + if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */ + { + int step = 8 << h->mb.chroma_v_shift; + for( int i = 16; i < 3*16; i += step ) + for( int j = i; j < i+4; j++ ) + block_residual_write_cavlc( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 ); + } } #if !RDO_SKIP_BS @@ -592,8 +599,19 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel ) x264_macroblock_luma_write_cavlc( h, p*4+i8, p*4+i8 ); if( h->mb.i_cbp_chroma ) { - block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 ); - block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + int offset = (5*i8) & 0x09; + block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1 ); + block_residual_write_cavlc( h, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1 ); + block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1 ); + block_residual_write_cavlc( h, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1 ); + } + else + { + block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 ); + block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 ); + } } i8 += x264_pixel_size[i_pixel].h >> 3; } @@ -644,18 +662,21 @@ static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode ) return h->out.bs.i_bits_encoded; } -static int x264_i8x8_chroma_size_cavlc( x264_t *h ) +static int x264_chroma_size_cavlc( x264_t *h ) { - h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] ); + h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] ); if( h->mb.i_cbp_chroma ) { block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] ); block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] ); if( h->mb.i_cbp_chroma == 2 ) - for( int ch = 1; ch < 3; ch++ ) - for( int i = ch*16; i < ch*16+4; i++ ) - block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 ); + { + int step = 8 << h->mb.chroma_v_shift; + for( int i = 16; i < 3*16; i += step ) + for( int j = i; j < i+4; j++ ) + block_residual_write_cavlc( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 ); + } } return h->out.bs.i_bits_encoded; } diff --git a/encoder/encoder.c b/encoder/encoder.c index 987b39a4..4c47a998 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -71,7 +71,7 @@ static void x264_frame_dump( x264_t *h ) return; /* Write the frame in display order */ - int frame_size = h->param.i_height * h->param.i_width * (3<param.i_height * h->param.i_width * sizeof(pixel) ); fseek( f, (uint64_t)h->fdec->i_frame * frame_size, SEEK_SET ); for( int p = 0; p < (CHROMA444 ? 
3 : 1); p++ ) for( int y = 0; y < h->param.i_height; y++ ) @@ -79,7 +79,7 @@ static void x264_frame_dump( x264_t *h ) if( !CHROMA444 ) { int cw = h->param.i_width>>1; - int ch = h->param.i_height>>1; + int ch = h->param.i_height>>h->mb.chroma_v_shift; pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) ); pixel *planev = planeu + cw*ch + 16; h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch ); @@ -90,7 +90,6 @@ static void x264_frame_dump( x264_t *h ) fclose( f ); } - /* Fill "default" values */ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh, x264_sps_t *sps, x264_pps_t *pps, @@ -400,6 +399,17 @@ static int x264_validate_parameters( x264_t *h, int b_open ) return -1; } #endif + +#if HAVE_INTERLACED + h->param.b_interlaced = !!PARAM_INTERLACED; +#else + if( h->param.b_interlaced ) + { + x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" ); + return -1; + } +#endif + if( h->param.i_width <= 0 || h->param.i_height <= 0 ) { x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n", @@ -410,26 +420,30 @@ static int x264_validate_parameters( x264_t *h, int b_open ) int i_csp = h->param.i_csp & X264_CSP_MASK; if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX ) { - x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I444/YV24/BGR/BGRA/RGB supported)\n" ); + x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); return -1; } - if( i_csp < X264_CSP_I444 && (h->param.i_width % 2 || h->param.i_height % 2) ) + if( i_csp < X264_CSP_I444 && h->param.i_width % 2 ) { - x264_log( h, X264_LOG_ERROR, "width or height not divisible by 2 (%dx%d)\n", + x264_log( h, X264_LOG_ERROR, "width not divisible by 2 (%dx%d)\n", h->param.i_width, h->param.i_height ); return -1; } -#if HAVE_INTERLACED - h->param.b_interlaced = !!PARAM_INTERLACED; -#else - if( h->param.b_interlaced ) + if( i_csp < X264_CSP_I422 && PARAM_INTERLACED && h->param.i_height % 4 ) { - x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" ); + x264_log( h, X264_LOG_ERROR, "height not divisible by 4 (%dx%d)\n", + h->param.i_width, h->param.i_height ); + return -1; + } + + if( (i_csp < X264_CSP_I422 || PARAM_INTERLACED) && h->param.i_height % 2 ) + { + x264_log( h, X264_LOG_ERROR, "height not divisible by 2 (%dx%d)\n", + h->param.i_width, h->param.i_height ); return -1; } -#endif if( (h->param.crop_rect.i_left + h->param.crop_rect.i_right ) >= h->param.i_width || (h->param.crop_rect.i_top + h->param.crop_rect.i_bottom) >= h->param.i_height ) @@ -927,7 +941,8 @@ static void mbcmp_init( x264_t *h ) memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) ); memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) ); h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16; - h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c; + h->pixf.intra_mbcmp_x3_8x16c = satd ? h->pixf.intra_satd_x3_8x16c : h->pixf.intra_sad_x3_8x16c; + h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c; h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8; h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4; h->pixf.intra_mbcmp_x9_4x4 = h->param.b_cpu_independent || h->mb.b_lossless ? 
NULL @@ -938,6 +953,39 @@ static void mbcmp_init( x264_t *h ) memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) ); } +static void chroma_dsp_init( x264_t *h ) +{ + memcpy( h->luma2chroma_pixel, x264_luma2chroma_pixel[CHROMA_FORMAT], sizeof(h->luma2chroma_pixel) ); + + switch( CHROMA_FORMAT ) + { + case CHROMA_420: + memcpy( h->predict_chroma, h->predict_8x8c, sizeof(h->predict_chroma) ); + h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_420; + h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_420_intra; + h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_420_mbaff; + h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_420_intra_mbaff; + h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x8c; + h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last4; + h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run4; + break; + case CHROMA_422: + memcpy( h->predict_chroma, h->predict_8x16c, sizeof(h->predict_chroma) ); + h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_422; + h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_422_intra; + h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_422_mbaff; + h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_422_intra_mbaff; + h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x16c; + h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last8; + h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run8; + break; + case CHROMA_444: + h->loopf.deblock_chroma_mbaff = h->loopf.deblock_luma_mbaff; + h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_luma_intra_mbaff; + break; + } +} + static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial ) { /* VUI */ @@ -1039,6 +1087,10 @@ x264_t *x264_encoder_open( x264_param_t *param ) h->mb.i_mb_width = h->sps->i_mb_width; h->mb.i_mb_height = h->sps->i_mb_height; h->mb.i_mb_count = h->mb.i_mb_width * h->mb.i_mb_height; + + h->mb.chroma_h_shift = CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422; + h->mb.chroma_v_shift = CHROMA_FORMAT == CHROMA_420; + /* Adaptive MBAFF and subme 0 are not supported as we require halving motion * vectors during prediction, resulting in hpel mvs. * The chosen solution is to make MBAFF non-adaptive in this case. */ @@ -1092,6 +1144,7 @@ x264_t *x264_encoder_open( x264_param_t *param ) /* init CPU functions */ x264_predict_16x16_init( h->param.cpu, h->predict_16x16 ); x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c ); + x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c ); x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter ); x264_predict_4x4_init( h->param.cpu, h->predict_4x4 ); if( h->param.b_cabac ) @@ -1109,6 +1162,7 @@ x264_t *x264_encoder_open( x264_param_t *param ) x264_dct_init_weights(); mbcmp_init( h ); + chroma_dsp_init( h ); p = buf + sprintf( buf, "using cpu capabilities:" ); for( int i = 0; x264_cpu_names[i].flags; i++ ) @@ -1238,6 +1292,7 @@ x264_t *x264_encoder_open( x264_param_t *param ) h->sps->i_profile_idc == PROFILE_MAIN ? "Main" : h->sps->i_profile_idc == PROFILE_HIGH ? "High" : h->sps->i_profile_idc == PROFILE_HIGH10 ? (h->sps->b_constraint_set3 == 1 ? "High 10 Intra" : "High 10") : + h->sps->i_profile_idc == PROFILE_HIGH422 ? (h->sps->b_constraint_set3 == 1 ? "High 4:2:2 Intra" : "High 4:2:2") : h->sps->b_constraint_set3 == 1 ? 
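
[Note: chroma_dsp_init centralizes the per-format wiring so per-macroblock code stays branch-free: the generic predict_chroma/deblock/coeff_last pointers are aliased to the 4:2:0 or 4:2:2 variants once at open time. A minimal sketch of that init-time dispatch pattern, with hypothetical names and types standing in for the real tables:]

    #include <assert.h>

    typedef void (*predict_fn)( unsigned char *src );

    static void pred_dc_8x8c( unsigned char *src )  { (void)src; /* stand-in */ }
    static void pred_dc_8x16c( unsigned char *src ) { (void)src; /* stand-in */ }

    struct chroma_dsp { predict_fn predict_dc; int dc_coeffs; };

    static void chroma_dsp_wire( struct chroma_dsp *dsp, int v_shift )
    {
        dsp->predict_dc = v_shift ? pred_dc_8x8c : pred_dc_8x16c;
        dsp->dc_coeffs  = 8 >> v_shift; /* the coeff_last4 vs coeff_last8 analogue */
    }

    int main(void)
    {
        struct chroma_dsp dsp;
        chroma_dsp_wire( &dsp, 0 ); /* 4:2:2 */
        assert( dsp.predict_dc == pred_dc_8x16c && dsp.dc_coeffs == 8 );
        return 0;
    }
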
"High 4:4:4 Intra" : "High 4:4:4 Predictive"; char level[4]; snprintf( level, sizeof(level), "%d.%d", h->sps->i_level_idc/10, h->sps->i_level_idc%10 ); @@ -1252,8 +1307,9 @@ x264_t *x264_encoder_open( x264_param_t *param ) } else { + static const char * const subsampling[4] = { "4:0:0", "4:2:0", "4:2:2", "4:4:4" }; x264_log( h, X264_LOG_INFO, "profile %s, level %s, %s %d-bit\n", - profile, level, CHROMA444 ? "4:4:4" : "4:2:0", BIT_DEPTH ); + profile, level, subsampling[CHROMA_FORMAT], BIT_DEPTH ); } return h; @@ -1776,7 +1832,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop ) * consistency by copying deblocked pixels between planes. */ if( PARAM_INTERLACED ) for( int p = 0; p < h->fdec->i_plane; p++ ) - for( int i = minpix_y>>(!CHROMA444 && p); i < maxpix_y>>(!CHROMA444 && p); i++ ) + for( int i = minpix_y>>(h->mb.chroma_v_shift && p); i < maxpix_y>>(h->mb.chroma_v_shift && p); i++ ) memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p], h->fdec->plane[p] + i*h->fdec->i_stride[p], h->mb.i_mb_width*16*sizeof(pixel) ); @@ -1815,10 +1871,11 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop ) if( !CHROMA444 ) { uint64_t ssd_u, ssd_v; + int v_shift = h->mb.chroma_v_shift; x264_pixel_ssd_nv12( &h->pixf, - h->fdec->plane[1] + (minpix_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1], - h->fenc->plane[1] + (minpix_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1], - h->param.i_width>>1, (maxpix_y-minpix_y)>>1, &ssd_u, &ssd_v ); + h->fdec->plane[1] + (minpix_y>>v_shift) * h->fdec->i_stride[1], h->fdec->i_stride[1], + h->fenc->plane[1] + (minpix_y>>v_shift) * h->fenc->i_stride[1], h->fenc->i_stride[1], + h->param.i_width>>1, (maxpix_y-minpix_y)>>v_shift, &ssd_u, &ssd_v ); h->stat.frame.i_ssd[1] += ssd_u; h->stat.frame.i_ssd[2] += ssd_v; } @@ -2263,7 +2320,7 @@ reencode: else //if( h->mb.i_type == I_4x4 ) for( int i = 0; i < 16; i++ ) h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++; - h->stat.frame.i_mb_pred_mode[3][x264_mb_pred_mode8x8c_fix[h->mb.i_chroma_pred_mode]]++; + h->stat.frame.i_mb_pred_mode[3][x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]]++; } h->stat.frame.i_mb_field[b_intra?0:b_skip?2:1] += MB_INTERLACED; } @@ -3141,7 +3198,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current, h->stat.frame.i_ssd[2], }; int luma_size = h->param.i_width * h->param.i_height; - int chroma_size = h->param.i_width * h->param.i_height >> (!CHROMA444 * 2); + int chroma_size = CHROMA_SIZE( luma_size ); double psnr_y = x264_psnr( ssd[0], luma_size ); double psnr_u = x264_psnr( ssd[1], chroma_size ); double psnr_v = x264_psnr( ssd[2], chroma_size ); @@ -3232,9 +3289,7 @@ static void x264_print_intra( int64_t *i_mb_count, double i_count, int b_print_p ****************************************************************************/ void x264_encoder_close ( x264_t *h ) { - int luma_size = h->param.i_width * h->param.i_height; - int chroma_size = h->param.i_width * h->param.i_height >> (!CHROMA444 * 2); - int64_t i_yuv_size = luma_size + chroma_size * 2; + int64_t i_yuv_size = FRAME_SIZE( h->param.i_width * h->param.i_height ); int64_t i_mb_count_size[2][7] = {{0}}; char buf[200]; int b_print_pcm = h->stat.i_mb_count[SLICE_TYPE_I][I_PCM] @@ -3470,7 +3525,7 @@ void x264_encoder_close ( x264_t *h ) } for( int i = 0; i <= I_PRED_CHROMA_DC_128; i++ ) { - fixed_pred_modes[3][x264_mb_pred_mode8x8c_fix[i]] += h->stat.i_mb_pred_mode[3][i]; + fixed_pred_modes[3][x264_mb_chroma_pred_mode_fix[i]] += 
h->stat.i_mb_pred_mode[3][i]; sum_pred_modes[3] += h->stat.i_mb_pred_mode[3][i]; } if( sum_pred_modes[3] && !CHROMA444 ) diff --git a/encoder/macroblock.c b/encoder/macroblock.c index a8768c57..0dfebb26 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -6,6 +6,7 @@ * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,7 +41,19 @@ static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] ) } #undef ZIG -#define IDCT_DEQUANT_START \ +static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] ) +{ + level[0] = dct[0]; + level[1] = dct[2]; + level[2] = dct[1]; + level[3] = dct[4]; + level[4] = dct[6]; + level[5] = dct[3]; + level[6] = dct[5]; + level[7] = dct[7]; +} + +#define IDCT_DEQUANT_2X2_START \ int d0 = dct[0] + dct[1]; \ int d1 = dct[2] + dct[3]; \ int d2 = dct[0] - dct[1]; \ @@ -49,21 +62,22 @@ static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] ) static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp ) { - IDCT_DEQUANT_START + IDCT_DEQUANT_2X2_START dct4x4[0][0] = (d0 + d1) * dmf >> 5; dct4x4[1][0] = (d0 - d1) * dmf >> 5; dct4x4[2][0] = (d2 + d3) * dmf >> 5; dct4x4[3][0] = (d2 - d3) * dmf >> 5; } -static inline void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf[6][16], int i_qp ) +static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp ) { - IDCT_DEQUANT_START - out[0] = (d0 + d1) * dmf >> 5; - out[1] = (d0 - d1) * dmf >> 5; - out[2] = (d2 + d3) * dmf >> 5; - out[3] = (d2 - d3) * dmf >> 5; + IDCT_DEQUANT_2X2_START + dct[0] = (d0 + d1) * dmf >> 5; + dct[1] = (d0 - d1) * dmf >> 5; + dct[2] = (d2 + d3) * dmf >> 5; + dct[3] = (d2 - d3) * dmf >> 5; } +#undef IDCT_2X2_DEQUANT_START static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] ) { @@ -81,6 +95,23 @@ static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] ) dct4x4[3][0] = 0; } +static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count ) +{ + if( WORD_SIZE == 8 ) + { + for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) ) + if( M64( &v[i] ) ) + return 1; + } + else + { + for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) ) + if( M32( &v[i] ) ) + return 1; + } + return 0; +} + static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx ) { int i_quant_cat = b_intra ? 
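
[Note: idct_dequant_2x2_dconly now works in place, which is safe because the 2x2 chroma DC transform is a Hadamard butterfly: applying it twice scales the input by exactly 4, and that factor is folded into the `* dmf >> 5` dequant scaling. A numeric round-trip check of the butterfly alone, quantization ignored:]

    #include <assert.h>

    static void hadamard2x2( int d[4] )
    {
        int d0 = d[0] + d[1], d1 = d[2] + d[3];
        int d2 = d[0] - d[1], d3 = d[2] - d[3];
        d[0] = d0 + d1; d[1] = d0 - d1;
        d[2] = d2 + d3; d[3] = d2 - d3;
    }

    int main(void)
    {
        int v[4] = { 7, -3, 2, 5 }, w[4] = { 7, -3, 2, 5 };
        hadamard2x2( w );
        hadamard2x2( w ); /* self-inverse up to a factor of 4 */
        for( int i = 0; i < 4; i++ )
            assert( w[i] == 4*v[i] );
        return 0;
    }
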
(p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY); @@ -236,7 +267,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp ) block_cbp |= nz; } h->mb.i_cbp_luma |= block_cbp * 0xf; - h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4 ); + h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 ); h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 ); return; } @@ -278,7 +309,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp ) h->dctf.dct4x4dc( dct_dc4x4 ); if( h->mb.b_trellis ) - nz = x264_quant_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, 0, LUMA_DC+p ); + nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p ); else nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 ); @@ -306,7 +337,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp ) * Unlike luma blocks, this can't be done with a lookup table or * other shortcut technique because of the interdependencies * between the coefficients due to the chroma DC transform. */ -static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef dct2x2[4], int dequant_mf[6][16], int i_qp ) +static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 ) { int dmf = dequant_mf[i_qp%6][0] << i_qp/6; @@ -314,14 +345,18 @@ static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef dct2x2[4 if( dmf > 32*64 ) return 1; - return h->quantf.optimize_chroma_dc( dct2x2, dmf ); + if( chroma422 ) + return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf ); + else + return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf ); } -void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) +static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 ) { int nz, nz_dc; int b_decimate = b_inter && h->mb.b_dct_decimate; - ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] ); + int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter]; + ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] ); h->mb.i_cbp_chroma = 0; h->nr_count[2] += h->mb.b_noise_reduction * 4; @@ -330,17 +365,26 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) * Values are experimentally derived. */ if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction ) { - int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6; + int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6; int ssd[2]; - int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] ); + int chromapix = chroma422 ? 
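
[Note: the 4:2:2 decimation threshold `(lambda2+16)>>5` is, up to rounding, exactly twice the 4:2:0 `(lambda2+32)>>6`, tracking the doubled chroma sample count per macroblock. A brute-force check of that relationship:]

    #include <assert.h>
    #include <stdlib.h>

    int main(void)
    {
        for( int lambda2 = 0; lambda2 < 1<<16; lambda2++ )
        {
            int t422 = (lambda2 + 16) >> 5;
            int t420 = (lambda2 + 32) >> 6;
            assert( abs( t422 - 2*t420 ) <= 1 ); /* double, modulo rounding */
        }
        return 0;
    }
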
PIXEL_8x16 : PIXEL_8x8; + + int score = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] ); if( score < thresh*4 ) - score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); + score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); if( score < thresh*4 ) { M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; + if( chroma422 ) + { + M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; + } h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0; @@ -348,20 +392,43 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) { if( ssd[ch] > thresh ) { - h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] ); + pixel *p_src = h->mb.pic.p_fenc[1+ch]; + pixel *p_dst = h->mb.pic.p_fdec[1+ch]; + + if( chroma422 ) + /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */ + h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst ); + else + h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst ); + if( h->mb.b_trellis ) - nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1, CHROMA_DC+ch ); + nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch ); else - nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 ); + { + nz_dc = 0; + for( int i = 0; i <= chroma422; i++ ) + nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1, + h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 ); + } if( nz_dc ) { - if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) ) + if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) ) continue; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1; - zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); - idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); - h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 ); + if( chroma422 ) + { + zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc ); + h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 ); + } + else + { + zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc ); + idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp ); + } + + for( int i = 0; i <= chroma422; i++ ) + h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] ); h->mb.i_cbp_chroma = 1; } } @@ -377,78 +444,120 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) int i_decimate_score = 0; int nz_ac = 0; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] ); + ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] ); if( h->mb.b_lossless ) { - for( int i = 0; i < 4; i++ ) + static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 }; + + for( int i = 0; i < (chroma422?8:4); i++ ) { - int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE; - int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE; - nz = h->zigzagf.sub_4x4ac( 
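
[Note: the `i_qp + 3*chroma422` seen in the DC quant calls mirrors the extra QP offset applied to 4:2:2 chroma DC blocks, compensating for the larger gain of the 2x4 DC Hadamard relative to the 2x2 one. Since the quantizer step doubles every 6 QP, 3 steps is a factor of sqrt(2); a one-line check (link with -lm):]

    #include <assert.h>
    #include <math.h>

    int main(void)
    {
        assert( fabs( pow( 2.0, 3/6.0 ) - sqrt( 2.0 ) ) < 1e-12 );
        return 0;
    }
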
h->dct.luma4x4[16+i+ch*16], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] ); - h->mb.cache.non_zero_count[x264_scan8[16+i+ch*16]] = nz; + int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE; + int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE; + nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od, + &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] ); + h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz; h->mb.i_cbp_chroma |= nz; } - h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch] ); + h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 ); continue; } - h->dctf.sub8x8_dct( dct4x4, p_src, p_dst ); + for( int i = 0; i <= chroma422; i++ ) + h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE ); + if( h->mb.b_noise_reduction ) - for( int i = 0; i < 4; i++ ) + for( int i = 0; i < (chroma422?8:4); i++ ) h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 ); - dct2x2dc( dct2x2, dct4x4 ); + + if( chroma422 ) + h->dctf.dct2x4dc( dct_dc, dct4x4 ); + else + dct2x2dc( dct_dc, dct4x4 ); + /* calculate dct coeffs */ - for( int i = 0; i < 4; i++ ) + for( int i = 0; i < (chroma422?8:4); i++ ) { if( h->mb.b_trellis ) nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ); else nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); - h->mb.cache.non_zero_count[x264_scan8[16+i+ch*16]] = nz; + h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz; if( nz ) { nz_ac = 1; - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*16], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp ); + h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] ); + h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp ); if( b_decimate ) - i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*16] ); + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] ); } } if( h->mb.b_trellis ) - nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1, CHROMA_DC+ch ); + nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch ); else - nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 ); + { + nz_dc = 0; + for( int i = 0; i <= chroma422; i++ ) + nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1, + h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 ); + } h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc; if( (b_decimate && i_decimate_score < 7) || !nz_ac ) { /* Decimate the block */ - M16( &h->mb.cache.non_zero_count[x264_scan8[16+0+16*ch]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[16+2+16*ch]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0; + if( chroma422 ) + { + M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0; + } + if( !nz_dc ) /* Whole block is empty */ continue; - if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) ) + if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) 
) { h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0; continue; } /* DC-only */ - zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); - idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); - h->dctf.add8x8_idct_dc( p_dst, dct2x2 ); + if( chroma422 ) + { + zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc ); + h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 ); + } + else + { + zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc ); + idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp ); + } + + for( int i = 0; i <= chroma422; i++ ) + h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] ); } else { h->mb.i_cbp_chroma = 1; + if( nz_dc ) { - zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); - idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); + if( chroma422 ) + { + zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc ); + h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 ); + } + else + { + zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc ); + idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp ); + } } - h->dctf.add8x8_idct( p_dst, dct4x4 ); + + for( int i = 0; i <= chroma422; i++ ) + h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] ); } } @@ -457,6 +566,14 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma); } +void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp ) +{ + if( CHROMA_FORMAT == CHROMA_420 ) + x264_mb_encode_chroma_internal( h, b_inter, i_qp, 0 ); + else + x264_mb_encode_chroma_internal( h, b_inter, i_qp, 1 ); +} + static void x264_macroblock_encode_skip( x264_t *h ) { M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0; @@ -467,7 +584,7 @@ static void x264_macroblock_encode_skip( x264_t *h ) M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0; - if( CHROMA444 ) + if( CHROMA_FORMAT >= CHROMA_422 ) { M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0; @@ -483,26 +600,32 @@ static void x264_macroblock_encode_skip( x264_t *h ) * Intra prediction for predictive lossless mode. 
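The x264_mb_encode_chroma wrapper above is a thin dispatcher: the real body is an ALWAYS_INLINE internal function that receives chroma422 as a compile-time constant, so the compiler emits one specialized copy per chroma format and every chroma422?x:y selection in the hot path folds away. A minimal sketch of the idiom (names are illustrative, not x264's):

    #include <stdio.h>

    #define ALWAYS_INLINE inline __attribute__((always_inline))

    static ALWAYS_INLINE int encode_chroma_internal( int qp, int chroma422 )
    {
        /* With chroma422 a literal 0 or 1 at each call site, this ternary
         * constant-folds and the dead branch is dropped entirely. */
        int blocks = chroma422 ? 8 : 4;
        return blocks * qp;
    }

    static int encode_chroma_420( int qp ) { return encode_chroma_internal( qp, 0 ); }
    static int encode_chroma_422( int qp ) { return encode_chroma_internal( qp, 1 ); }

    int main(void)
    {
        printf( "%d %d\n", encode_chroma_420( 26 ), encode_chroma_422( 26 ) );
        return 0;
    }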
*****************************************************************************/ -void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode ) +void x264_predict_lossless_chroma( x264_t *h, int i_mode ) { + int height = 16 >> h->mb.chroma_v_shift; if( i_mode == I_PRED_CHROMA_V ) { - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, 8 ); - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height ); memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) ); memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) ); } else if( i_mode == I_PRED_CHROMA_H ) { - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, 8 ); - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height ); x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 ); x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 ); + x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 ); + } } else { - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } } @@ -563,8 +686,9 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_ h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 ); if( chroma ) { - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 ); - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 ); + int height = 16 >> h->mb.chroma_v_shift; + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height ); + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height ); } return; } @@ -598,22 +722,26 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_ if( chroma ) { + int v_shift = h->mb.chroma_v_shift; + int height = 16 >> v_shift; + /* Special case for mv0, which is (of course) very common in P-skip mode. 
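The 2*mvy>>v_shift in the mc_chroma call just below is the whole story of vertical chroma MV scaling: mc_chroma effectively works in eighth-pel chroma units, so for 4:2:0 (v_shift=1, half-height chroma) the luma quarter-pel MV passes through unchanged, while for 4:2:2 (v_shift=0, full-height chroma) it must be doubled. A tiny self-check, assuming quarter-pel luma MVs as elsewhere in x264:

    #include <assert.h>

    /* Vertical chroma MV (1/8-pel chroma units) from a 1/4-pel luma MV,
     * as in the mc_chroma calls of this patch. */
    static int chroma_mvy( int mvy, int v_shift )
    {
        return 2*mvy >> v_shift;
    }

    int main(void)
    {
        assert( chroma_mvy(  6, 1 ) ==  6 );  /* 4:2:0: passes through       */
        assert( chroma_mvy(  6, 0 ) == 12 );  /* 4:2:2: doubled              */
        assert( chroma_mvy( -5, 1 ) == -5 );  /* arithmetic shift keeps sign */
        return 0;
    }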
*/ if( mvx | mvy ) h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], - mvx, mvy, 8, 8 ); + mvx, 2*mvy>>v_shift, 8, height ); else - h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] ); + h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], + h->mb.pic.i_stride[1], height ); if( h->sh.weight[0][1].weightfn ) h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, - &h->sh.weight[0][1], 8 ); + &h->sh.weight[0][1], height ); if( h->sh.weight[0][2].weightfn ) h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, - &h->sh.weight[0][2], 8 ); + &h->sh.weight[0][2], height ); } } @@ -861,18 +989,18 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_ { if( IS_INTRA( h->mb.i_type ) ) { - const int i_mode = h->mb.i_chroma_pred_mode; + int i_mode = h->mb.i_chroma_pred_mode; if( h->mb.b_lossless ) - x264_predict_lossless_8x8_chroma( h, i_mode ); + x264_predict_lossless_chroma( h, i_mode ); else { - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } } /* encode the 8x8 blocks */ - x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp ); + x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp ); } else h->mb.i_cbp_chroma = 0; @@ -920,13 +1048,10 @@ void x264_macroblock_encode( x264_t *h ) *****************************************************************************/ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma ) { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] ); - ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] ); + ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] ); ALIGNED_ARRAY_16( dctcoef, dctscan,[16] ); ALIGNED_4( int16_t mvp[2] ); - int i_qp = h->mb.i_qp; - int thresh, ssd; for( int p = 0; p < plane_count; p++ ) { @@ -966,11 +1091,13 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b i_qp = h->mb.i_chroma_qp; } - if( chroma ) + if( chroma == CHROMA_420 || chroma == CHROMA_422 ) { - /* encode chroma */ i_qp = h->mb.i_chroma_qp; - thresh = (x264_lambda2_tab[i_qp] + 32) >> 6; + int chroma422 = chroma == CHROMA_422; + int thresh = chroma422 ? 
(x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
+        int ssd;
+        ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );

         if( !b_bidir )
         {
@@ -978,9 +1105,10 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
             if( M32( mvp ) )
                 h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                  h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
-                                 mvp[0], mvp[1], 8, 8 );
+                                 mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
             else
-                h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
+                h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
+                                                     h->mb.pic.i_stride[1], chroma422?16:8 );
         }

         for( int ch = 0; ch < 2; ch++ )
@@ -991,11 +1119,11 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b

             if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
                 h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                       h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
-                                                      &h->sh.weight[0][1+ch], 8 );
+                                                      &h->sh.weight[0][1+ch], chroma422?16:8 );

             /* there is almost never a termination during chroma, but we can't avoid the check entirely */
             /* so instead we check SSD and skip the actual check if the score is low enough. */
-            ssd = h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
+            ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
             if( ssd < thresh )
                 continue;

@@ -1003,28 +1131,38 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
              * threshold check, so we can save time by doing a DC-only DCT. */
             if( h->mb.b_noise_reduction )
             {
-                h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
-                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+                for( int i = 0; i <= chroma422; i++ )
+                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
+
+                for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
                 {
                     h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
-                    dct2x2[i4x4] = dct4x4[i4x4][0];
+                    dct_dc[i4x4] = dct4x4[i4x4][0];
                 }
             }
             else
-                h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );
+            {
+                if( chroma422 )
+                    h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
+                else
+                    h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
+            }

-            if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
-                return 0;
+            for( int i = 0; i <= chroma422; i++ )
+                if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
+                                            h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
+                    return 0;

             /* If there wasn't a termination in DC, we can check against a much higher threshold.
*/ if( ssd < thresh*4 ) continue; if( !h->mb.b_noise_reduction ) - h->dctf.sub8x8_dct( dct4x4, p_src, p_dst ); + for( int i = 0; i <= chroma422; i++ ) + h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE ); /* calculate dct coeffs */ - for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ ) + for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < (chroma422?8:4); i4x4++ ) { dct4x4[i4x4][0] = 0; if( h->mb.b_noise_reduction ) @@ -1045,10 +1183,12 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) { - if( CHROMA444 ) - return x264_macroblock_probe_skip_internal( h, b_bidir, 3, 0 ); + if( CHROMA_FORMAT == CHROMA_444 ) + return x264_macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 ); + else if( CHROMA_FORMAT == CHROMA_422 ) + return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 ); else - return x264_macroblock_probe_skip_internal( h, b_bidir, 1, 1 ); + return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 ); } /**************************************************************************** @@ -1096,6 +1236,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i int x = i8&1; int y = i8>>1; int nz; + int chroma422 = chroma == CHROMA_422; h->mb.i_cbp_chroma = 0; h->mb.i_cbp_luma &= ~(1 << i8); @@ -1128,15 +1269,20 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i } h->mb.i_cbp_luma |= nnz8x8 << i8; } - if( chroma ) + if( chroma == CHROMA_420 || chroma == CHROMA_422 ) { for( int ch = 0; ch < 2; ch++ ) { dctcoef dc; - pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE; - pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE; - nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*16], p_fenc, p_fdec, &dc ); - h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*16]] = nz; + pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; + pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; + + for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ ) + { + int offset = chroma422 ? 
8*y + 2*i4x4 + x : i8; + nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc ); + h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz; + } } h->mb.i_cbp_chroma = 0x02; } @@ -1212,30 +1358,36 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i } } - if( chroma ) + if( chroma == CHROMA_420 || chroma == CHROMA_422 ) { i_qp = h->mb.i_chroma_qp; for( int ch = 0; ch < 2; ch++ ) { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] ); - pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE; - pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE; - h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); - if( h->mb.b_noise_reduction ) - h->quantf.denoise_dct( dct4x4, h->nr_residual_sum[2], h->nr_offset[2], 16 ); - dct4x4[0] = 0; + ALIGNED_ARRAY_16( dctcoef, dct4x4,[2],[16] ); + pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; + pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; - if( h->mb.b_trellis ) - nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 ); - else - nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); - - h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*16]] = nz; - if( nz ) + for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ ) { - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*16], dct4x4 ); - h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp ); - h->dctf.add4x4_idct( p_fdec, dct4x4 ); + h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE ); + + if( h->mb.b_noise_reduction ) + h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 ); + dct4x4[i4x4][0] = 0; + + if( h->mb.b_trellis ) + nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 ); + else + nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); + + int offset = chroma422 ? 
((5*i8) & 0x09) + 2*i4x4 : i8; + h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz; + if( nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] ); + h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp ); + h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] ); + } } } h->mb.i_cbp_chroma = 0x02; @@ -1246,9 +1398,11 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) { if( CHROMA444 ) - x264_macroblock_encode_p8x8_internal( h, i8, 3, 0 ); + x264_macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 ); + else if( CHROMA_FORMAT == CHROMA_422 ) + x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 ); else - x264_macroblock_encode_p8x8_internal( h, i8, 1, 1 ); + x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 ); } /***************************************************************************** diff --git a/encoder/macroblock.h b/encoder/macroblock.h index 5e3b188d..d8ca95dc 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -41,10 +41,10 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir ); #define x264_macroblock_probe_bskip( h )\ x264_macroblock_probe_skip( h, 1 ) -void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode ); void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode ); void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] ); void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode ); +void x264_predict_lossless_chroma( x264_t *h, int i_mode ); void x264_macroblock_encode ( x264_t *h ); void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb ); @@ -54,12 +54,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ); void x264_macroblock_encode_p4x4( x264_t *h, int i4 ); void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode ); void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge ); -void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ); +void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp ); void x264_cabac_mb_skip( x264_t *h, int b_skip ); -int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, - int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ); +int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, + int ctx_block_cat, int b_intra, int idx ); +int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx ); int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ); int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, diff --git a/encoder/me.c b/encoder/me.c index e21f2ca8..1c8c8bb3 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -805,17 +805,16 @@ if( b_refine_qpel || (dir^1) != odir ) \ } \ else \ { \ - h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ + h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], \ + mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \ if( m->weight[1].weightfn ) \ - m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \ - &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \ + m->weight[1].weightfn[bw>>3]( pix, 16, pix, 16, &m->weight[1], bh>>chroma_v_shift ); 
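The ((5*i8) & 0x09) expression in the p8x8 hunk above is a branchless form of 8*(i8>>1) + (i8&1): it spreads the 8x8 partition index into the 4:2:2 chroma block numbering, where moving down one partition row jumps eight block indices and the lower 4x4 of each 4x8 chroma column adds 2. A quick equivalence check:

    #include <assert.h>

    int main(void)
    {
        for( int i8 = 0; i8 < 4; i8++ )
            for( int i4x4 = 0; i4x4 < 2; i4x4++ )
            {
                int fast = ((5*i8) & 0x09) + 2*i4x4;    /* as in the patch   */
                int slow = 8*(i8>>1) + (i8&1) + 2*i4x4; /* the obvious form  */
                assert( fast == slow );
            }
        return 0;
    }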
\ + cost += h->pixf.mbcmp[chromapix]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \ if( cost < bcost ) \ { \ if( m->weight[2].weightfn ) \ - m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \ - &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \ + m->weight[2].weightfn[bw>>3]( pix+8, 16, pix+8, 16, &m->weight[2], bh>>chroma_v_shift ); \ + cost += h->pixf.mbcmp[chromapix]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \ } \ } \ } \ @@ -830,7 +829,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; const int i_pixel = m->i_pixel; const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444); - const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + int chromapix = h->luma2chroma_pixel[i_pixel]; + int chroma_v_shift = h->mb.chroma_v_shift; + int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment @@ -952,7 +953,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite }\ else\ h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1],\ - mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ + mvx, 2*(mvy+mv##list##y_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift );\ }\ } @@ -976,14 +977,17 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] ); ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] ); pixel *src[3][2][9]; - int chromasize = CHROMA444 ? 8 : 4; + int chromapix = h->luma2chroma_pixel[i_pixel]; + int chroma_v_shift = h->mb.chroma_v_shift; + int chroma_x = (8 >> h->mb.chroma_h_shift) * x; + int chroma_y = (8 >> chroma_v_shift) * y; pixel *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE]; - pixel *pixu = &h->mb.pic.p_fdec[1][chromasize*x + chromasize*y*FDEC_STRIDE]; - pixel *pixv = &h->mb.pic.p_fdec[2][chromasize*x + chromasize*y*FDEC_STRIDE]; + pixel *pixu = &h->mb.pic.p_fdec[1][chroma_x + chroma_y*FDEC_STRIDE]; + pixel *pixv = &h->mb.pic.p_fdec[2][chroma_x + chroma_y*FDEC_STRIDE]; int ref0 = h->mb.cache.ref[0][s8]; int ref1 = h->mb.cache.ref[1][s8]; - const int mv0y_offset = MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0; - const int mv1y_offset = MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? 
(h->mb.i_mb_y & 1)*4 - 2 : 0; int stride[3][2][9]; int bm0x = m0->mv[0]; int bm0y = m0->mv[1]; @@ -1071,8 +1075,8 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m } else { - h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight ); - h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight ); + h->mc.avg[chromapix]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight ); + h->mc.avg[chromapix]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight ); } uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel ); COPY2_IF_LT( bcostrd, costrd, bestj, j ); @@ -1153,13 +1157,12 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei } \ else if( m->i_pixel <= PIXEL_8x8 ) \ { \ - h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ + h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], \ + mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \ if( m->weight[1].weightfn ) \ - m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, \ - &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \ + m->weight[1].weightfn[bw>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, &m->weight[1], bh>>chroma_v_shift ); \ if( m->weight[2].weightfn ) \ - m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, \ - &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \ + m->weight[2].weightfn[bw>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, &m->weight[2], bh>>chroma_v_shift ); \ } \ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ @@ -1173,7 +1176,8 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int const int bw = x264_pixel_size[m->i_pixel].w; const int bh = x264_pixel_size[m->i_pixel].h; const int i_pixel = m->i_pixel; - const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + int chroma_v_shift = h->mb.chroma_v_shift; + int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; uint64_t bcost = COST_MAX64; int bmx = m->mv[0]; @@ -1193,8 +1197,8 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int } else { - pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; - pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; + pixu = &h->mb.pic.p_fdec[1][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4]; + pixv = &h->mb.pic.p_fdec[2][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4]; } h->mb.b_skip_mc = 1; diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index a64a9997..dfe52248 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -219,18 +219,21 @@ static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_f static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int b_chroma, int b_field, int b_store ) { - int w = b_chroma ? 8 : 16; + int height = b_chroma ? 16>>h->mb.chroma_v_shift : 16; int stride = frame->i_stride[i]; int offset = b_field - ? 16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride - : 16 * mb_x + w * mb_y * stride; + ? 
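The mvy_offset changes in the me.c hunks above gate the interlaced chroma phase correction on chroma_v_shift: the quarter-pel offset between top and bottom fields only exists when chroma is vertically subsampled, so 4:2:2 (v_shift 0) masks it out while 4:2:0 keeps the old behaviour. A sketch of the gating, with the x264 globals reduced to plain 0/1 flags:

    #include <assert.h>

    /* +2 or -2 (quarter-pel) vertical correction for field-coded 4:2:0 chroma;
     * zero whenever chroma has full vertical resolution (4:2:2/4:4:4). */
    static int chroma_mvy_offset( int v_shift, int mb_interlaced, int ref_field, int mb_y )
    {
        return (v_shift & mb_interlaced & ref_field) ? (mb_y & 1)*4 - 2 : 0;
    }

    int main(void)
    {
        assert( chroma_mvy_offset( 1, 1, 1, 0 ) == -2 ); /* 4:2:0, top field    */
        assert( chroma_mvy_offset( 1, 1, 1, 1 ) ==  2 ); /* 4:2:0, bottom field */
        assert( chroma_mvy_offset( 0, 1, 1, 1 ) ==  0 ); /* 4:2:2: no offset    */
        return 0;
    }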
16 * mb_x + height * (mb_y&~1) * stride + (mb_y&1) * stride + : 16 * mb_x + height * mb_y * stride; stride <<= b_field; if( b_chroma ) { - ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] ); - h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride ); - return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, 1, b_store ) - + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, 2, b_store ); + ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*16] ); + int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; + int shift = 7 - h->mb.chroma_v_shift; + + h->mc.load_deinterleave_chroma_fenc( pix, frame->plane[1] + offset, stride, height ); + return ac_energy_var( h->pixf.var[chromapix]( pix, FENC_STRIDE ), shift, frame, 1, b_store ) + + ac_energy_var( h->pixf.var[chromapix]( pix+FENC_STRIDE/2, FENC_STRIDE ), shift, frame, 2, b_store ); } else return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[i] + offset, stride ), 8, frame, i, b_store ); @@ -379,9 +382,8 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off { uint64_t ssd = frame->i_pixel_ssd[i]; uint64_t sum = frame->i_pixel_sum[i]; - int size = CHROMA444 || !i ? 16 : 8; - int width = h->mb.i_mb_width*size; - int height = h->mb.i_mb_height*size; + int width = 16*h->mb.i_mb_width >> (i && h->mb.chroma_h_shift); + int height = 16*h->mb.i_mb_height >> (i && h->mb.chroma_v_shift); frame->i_pixel_ssd[i] = ssd - (sum * sum + width * height / 2) / (width * height); } } @@ -1279,8 +1281,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead ) if( h->param.b_bluray_compat ) mincr = 4; - /* High 10 / High 4:4:4 Predictive doesn't require minCR, so just set the maximum to a large value. */ - if( h->sps->i_profile_idc >= PROFILE_HIGH10 ) + /* Profiles above High don't require minCR, so just set the maximum to a large value. */ + if( h->sps->i_profile_idc > PROFILE_HIGH ) rc->frame_size_maximum = 1e9; else { diff --git a/encoder/rdo.c b/encoder/rdo.c index f994fa02..4ca07508 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -146,7 +146,7 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y ) static inline int ssd_mb( x264_t *h ) { - int chroma_size = CHROMA444 ? 
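In the ac_energy_plane hunk above, shift = 7 - chroma_v_shift is simply log2 of the chroma block's pixel count: the AC energy is SSD minus sum*sum/N for N a power of two, with 8x16 = 128 = 2^7 samples in 4:2:2 and 8x8 = 64 = 2^6 in 4:2:0. A sketch of that normalization (ac_energy_var itself packs sum and ssd into one value; this unpacked form is mine):

    #include <assert.h>
    #include <stdint.h>

    /* AC energy of an N-pixel block, N = 1<<shift: SSD minus the DC part. */
    static uint32_t ac_energy( uint64_t ssd, uint64_t sum, int shift )
    {
        return ssd - ((sum * sum) >> shift);
    }

    int main(void)
    {
        /* A flat 8x8 block (all pixels 10): ssd = 64*100, sum = 640.
         * All of its energy is DC, so the AC term must come out zero. */
        assert( ac_energy( 6400, 640, 6 ) == 0 );
        /* The flat 8x16 case for 4:2:2 uses shift = 7 - 0 = 7. */
        assert( ac_energy( 12800, 1280, 7 ) == 0 );
        return 0;
    }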
PIXEL_16x16 : PIXEL_8x8; + int chroma_size = h->luma2chroma_pixel[PIXEL_16x16]; int chroma_ssd = ssd_plane(h, chroma_size, 1, 0, 0) + ssd_plane(h, chroma_size, 2, 0, 0); chroma_ssd = ((uint64_t)chroma_ssd * h->mb.i_chroma_lambda2_offset + 128) >> 8; return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chroma_ssd; @@ -227,7 +227,6 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel ) { uint64_t i_ssd, i_bits; int i8 = i4 >> 2; - int chromassd; if( i_pixel == PIXEL_16x16 ) { @@ -246,19 +245,13 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel ) if( i_pixel == PIXEL_8x16 ) x264_macroblock_encode_p8x8( h, i8+2 ); - i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 ); - if( CHROMA444 ) - { - chromassd = ssd_plane( h, i_pixel, 1, (i8&1)*8, (i8>>1)*8 ) - + ssd_plane( h, i_pixel, 2, (i8&1)*8, (i8>>1)*8 ); - } - else - { - chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 ) - + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 ); - } - chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8; - i_ssd += chromassd; + int ssd_x = 8*(i8&1); + int ssd_y = 8*(i8>>1); + i_ssd = ssd_plane( h, i_pixel, 0, ssd_x, ssd_y ); + int chromapix = h->luma2chroma_pixel[i_pixel]; + int chromassd = ssd_plane( h, chromapix, 1, ssd_x>>h->mb.chroma_h_shift, ssd_y>>h->mb.chroma_v_shift ) + + ssd_plane( h, chromapix, 2, ssd_x>>h->mb.chroma_h_shift, ssd_y>>h->mb.chroma_v_shift ); + i_ssd += ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8; if( h->param.b_cabac ) { @@ -343,14 +336,16 @@ static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode return (i_ssd<<8) + i_bits; } -static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct ) +static uint64_t x264_rd_cost_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct ) { uint64_t i_ssd, i_bits; if( b_dct ) - x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp ); - i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) + - ssd_plane( h, PIXEL_8x8, 2, 0, 0 ); + x264_mb_encode_chroma( h, 0, h->mb.i_chroma_qp ); + + int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; + i_ssd = ssd_plane( h, chromapix, 1, 0, 0 ) + + ssd_plane( h, chromapix, 2, 0, 0 ); h->mb.i_chroma_pred_mode = i_mode; @@ -358,11 +353,11 @@ static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, { x264_cabac_t cabac_tmp; COPY_CABAC; - x264_i8x8_chroma_size_cabac( h, &cabac_tmp ); + x264_chroma_size_cabac( h, &cabac_tmp ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2; + i_bits = x264_chroma_size_cavlc( h ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -443,7 +438,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, int ctx_block_cat, int i_lambda2, int b_ac, int b_chroma, int dc, int i_coefs, int idx ) { - int abs_coefs[64], signs[64]; + udctcoef abs_coefs[64]; + int8_t signs[64]; trellis_node_t nodes[2][8]; trellis_node_t *nodes_cur = nodes[0]; trellis_node_t *nodes_prev = nodes[1]; @@ -451,6 +447,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, const int b_interlaced = MB_INTERLACED; uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; + const uint8_t *levelgt1_ctx = b_chroma && dc ? 
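The trellis hunk above shrinks signs[] to int8_t because it only ever holds +1 or -1, computed just below as coef>>31 | 1: for a 32-bit coefficient, coef>>31 is 0 for non-negative values and -1 (all ones, given an arithmetic shift) for negative ones, so OR-ing in 1 yields the sign with no branch. Self-check:

    #include <assert.h>
    #include <stdint.h>

    static int8_t coef_sign( int32_t coef )
    {
        /* Arithmetic right shift gives 0 or -1; |1 forces the result odd. */
        return coef>>31 | 1;
    }

    int main(void)
    {
        assert( coef_sign( 1234 ) ==  1 );
        assert( coef_sign( 0 )    ==  1 ); /* sign of zero never used: level is 0 */
        assert( coef_sign( -7 )   == -1 );
        return 0;
    }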
coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; const int f = 1 << 15; // no deadzone int i_last_nnz; int i; @@ -486,7 +483,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, { int coef = dct[zigzag[i]]; abs_coefs[i] = abs(coef); - signs[i] = coef < 0 ? -1 : 1; + signs[i] = coef>>31 | 1; } /* init trellis */ @@ -519,7 +516,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, { // no need to calculate ssd of 0s: it's the same in all nodes. // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s. - int sigindex = i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : i; + int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : + b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i; const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 ) * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); for( int j = 1; j < 8; j++ ) @@ -546,8 +544,10 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, if( i < i_coefs-1 ) { - int sigindex = i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : i; - int lastindex = i_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i; + int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : + b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i; + int lastindex = !dc && i_coefs == 64 ? last_coeff_flag_offset_8x8[i] : + b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i; cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 ); cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 ); cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ); @@ -599,7 +599,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 ); if( i_prefix > 0 ) { - uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]]; + uint8_t *ctx = &n.cabac_state[levelgt1_ctx[node_ctx]]; f8_bits += cabac_size_unary[i_prefix][*ctx]; *ctx = cabac_transition_unary[i_prefix][*ctx]; if( abs_level >= 15 ) @@ -695,7 +695,8 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct, int64_t score = 1ULL<<62; int i, j; const int f = 1<<15; - int nC = ctx_block_cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, ctx_block_cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )]; + int nC = b_chroma && dc ? 3 + (i_coefs>>2) + : ct_index[x264_mb_predict_non_zero_code( h, !b_chroma && dc ? (idx - LUMA_DC)*16 : idx )]; /* Code for handling 8x8dct -> 4x4dct CAVLC munging. Input/output use a different * step/start/end than internal processing. */ @@ -857,24 +858,46 @@ zeroblock: return 0; } -const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3}; - -int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, - int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ) +int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int idx ) { if( h->param.b_cabac ) return quant_trellis_cabac( h, dct, - h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], - NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED], - ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 1, ctx_block_cat==DCT_CHROMA_DC ? 
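In quant_trellis_cavlc above, nC acts as the coeff_token table selector rather than the spec's neighbour count: luma and chroma AC still go through ct_index[], while chroma DC now picks its table directly from the coefficient count, 3 + (4>>2) = 4 for the 2x2 DC of 4:2:0 and 3 + (8>>2) = 5 for the 2x4 DC of 4:2:2. Worked out:

    #include <assert.h>

    /* coeff_token table selector for chroma DC, from the coefficient count. */
    static int chroma_dc_table( int i_coefs )
    {
        return 3 + (i_coefs>>2);
    }

    int main(void)
    {
        assert( chroma_dc_table( 4 ) == 4 ); /* 4:2:0: 2x2 chroma DC table */
        assert( chroma_dc_table( 8 ) == 5 ); /* 4:2:2: 2x4 chroma DC table */
        return 0;
    }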
4 : 16, idx ); + h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED], + ctx_block_cat, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx ); + + return quant_trellis_cavlc( h, dct, + h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED], + DCT_LUMA_DC, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx, 0 ); +} - if( ctx_block_cat != DCT_CHROMA_DC ) - ctx_block_cat = DCT_LUMA_DC; +static const uint8_t x264_zigzag_scan2x2[4] = { 0, 1, 2, 3 }; +static const uint8_t x264_zigzag_scan2x4[8] = { 0, 2, 1, 4, 6, 3, 5, 7 }; + +int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx ) +{ + const uint8_t *zigzag; + int num_coefs; + int quant_cat = CQM_4IC+1 - b_intra; + + if( CHROMA_FORMAT == CHROMA_422 ) + { + zigzag = x264_zigzag_scan2x4; + num_coefs = 8; + } + else + { + zigzag = x264_zigzag_scan2x2; + num_coefs = 4; + } + + if( h->param.b_cabac ) + return quant_trellis_cabac( h, dct, + h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag, + DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx ); return quant_trellis_cavlc( h, dct, - h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], - NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED], - ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, idx, 0 ); + h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag, + DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx, 0 ); } int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, diff --git a/encoder/set.c b/encoder/set.c index a498c945..5e1ff642 100644 --- a/encoder/set.c +++ b/encoder/set.c @@ -104,11 +104,14 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param ) sps->i_id = i_id; sps->i_mb_width = ( param->i_width + 15 ) / 16; sps->i_mb_height= ( param->i_height + 15 ) / 16; - sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? 3 : 1; + sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? CHROMA_444 : + csp >= X264_CSP_I422 ? CHROMA_422 : CHROMA_420; sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0; - if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == 3 ) + if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == CHROMA_444 ) sps->i_profile_idc = PROFILE_HIGH444_PREDICTIVE; + else if( sps->i_chroma_format_idc == CHROMA_422 ) + sps->i_profile_idc = PROFILE_HIGH422; else if( BIT_DEPTH > 8 ) sps->i_profile_idc = PROFILE_HIGH10; else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT ) @@ -132,11 +135,8 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param ) sps->b_constraint_set3 = 1; /* level 1b with Baseline, Main or Extended profile is signalled via constraint_set3 */ sps->i_level_idc = 11; } - /* High 10 Intra profile */ - if( param->i_keyint_max == 1 && sps->i_profile_idc == PROFILE_HIGH10 ) - sps->b_constraint_set3 = 1; - /* High 4:4:4 Intra profile */ - if( param->i_keyint_max == 1 && sps->i_profile_idc == PROFILE_HIGH444_PREDICTIVE ) + /* Intra profiles */ + if( param->i_keyint_max == 1 && sps->i_profile_idc > PROFILE_HIGH ) sps->b_constraint_set3 = 1; sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 
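The x264_sps_init hunk above derives both chroma_format_idc and the profile from the colourspace: lossless or 4:4:4 forces High 4:4:4 Predictive, 4:2:2 input selects the new High 4:2:2 profile, and any profile above High signals the matching Intra profile via constraint_set3 when keyint is 1. A condensed sketch of the decision ladder (plain ints; the helper name is mine, the profile_idc values are the spec's):

    #include <assert.h>

    enum { CHROMA_420 = 1, CHROMA_422 = 2, CHROMA_444 = 3 };
    enum { PROFILE_HIGH = 100, PROFILE_HIGH10 = 110,
           PROFILE_HIGH422 = 122, PROFILE_HIGH444_PREDICTIVE = 244 };

    static int choose_profile( int chroma_format_idc, int lossless,
                               int bit_depth, int b_8x8dct_or_cqm )
    {
        if( lossless || chroma_format_idc == CHROMA_444 )
            return PROFILE_HIGH444_PREDICTIVE;
        if( chroma_format_idc == CHROMA_422 )
            return PROFILE_HIGH422;
        if( bit_depth > 8 )
            return PROFILE_HIGH10;
        if( b_8x8dct_or_cqm )
            return PROFILE_HIGH;
        return 0; /* Main or lower; decided elsewhere */
    }

    int main(void)
    {
        assert( choose_profile( CHROMA_422, 0, 8, 1 ) == PROFILE_HIGH422 );
        assert( choose_profile( CHROMA_420, 1, 8, 0 ) == PROFILE_HIGH444_PREDICTIVE );
        return 0;
    }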
1 : 0; @@ -302,11 +302,12 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps ) bs_write1( s, sps->b_crop ); if( sps->b_crop ) { - int cropshift = sps->i_chroma_format_idc != 3; - bs_write_ue( s, sps->crop.i_left >> cropshift ); - bs_write_ue( s, sps->crop.i_right >> cropshift ); - bs_write_ue( s, sps->crop.i_top >> cropshift ); - bs_write_ue( s, sps->crop.i_bottom >> cropshift ); + int h_shift = sps->i_chroma_format_idc == CHROMA_420 || sps->i_chroma_format_idc == CHROMA_422; + int v_shift = sps->i_chroma_format_idc == CHROMA_420; + bs_write_ue( s, sps->crop.i_left >> h_shift ); + bs_write_ue( s, sps->crop.i_right >> h_shift ); + bs_write_ue( s, sps->crop.i_top >> v_shift ); + bs_write_ue( s, sps->crop.i_bottom >> v_shift ); } bs_write1( s, sps->b_vui ); @@ -757,7 +758,7 @@ int x264_validate_levels( x264_t *h, int verbose ) int ret = 0; int mbs = h->sps->i_mb_width * h->sps->i_mb_height; int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering; - int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH444_PREDICTIVE ? 16 : + int cbp_factor = h->sps->i_profile_idc>=PROFILE_HIGH422 ? 16 : h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 : h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4; diff --git a/encoder/slicetype.c b/encoder/slicetype.c index 5a91c167..0acda252 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -96,12 +96,11 @@ static NOINLINE pixel *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc return ref->lowres[0]; } -/* How data is organized for chroma weightp 4:2:0: +/* How data is organized for 4:2:0/4:2:2 chroma weightp: * [U: ref] [U: fenc] * [V: ref] [V: fenc] * fenc = ref + offset - * v = u + stride * chroma height - * We'll need more room if we do 4:2:2. */ + * v = u + stride * chroma height */ static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dstu, pixel *dstv ) { @@ -110,21 +109,23 @@ static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc int i_offset = i_stride / 2; int i_lines = fenc->i_lines[1]; int i_width = fenc->i_width[1]; - int cw = h->mb.i_mb_width << 3; - int ch = h->mb.i_mb_height << 3; + int v_shift = h->mb.chroma_v_shift; + int cw = 8*h->mb.i_mb_width; + int ch = 16*h->mb.i_mb_height >> v_shift; + int height = 16 >> v_shift; if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF ) { x264_frame_expand_border_chroma( h, ref, 1 ); - for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += 8, pel_offset_y = y*i_stride ) + for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += height, pel_offset_y = y*i_stride ) for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, mb_xy++, pel_offset_x += 8 ) { pixel *pixu = dstu + pel_offset_y + pel_offset_x; pixel *pixv = dstv + pel_offset_y + pel_offset_x; - pixel *src1 = ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12 */ + pixel *src1 = ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12/NV16 */ int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0]; int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1]; - h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, mvy, 8, 8 ); + h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, 2*mvy>>v_shift, 8, height ); } } else @@ -223,15 +224,17 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f int i_lines = fenc->i_lines[1]; int i_width = fenc->i_width[1]; pixel *src = ref + i_offset; - ALIGNED_ARRAY_16( pixel, buf, [8*8] ); + ALIGNED_ARRAY_16( pixel, buf, [8*16] ); int pixoff = 0; + int chromapix = 
h->luma2chroma_pixel[PIXEL_16x16]; + int height = 16 >> h->mb.chroma_v_shift; ALIGNED_16( static pixel flat[8] ) = {0}; if( w ) { - for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) + for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 8, pixoff += 8 ) { - w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, 8 ); + w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, height ); /* The naive and seemingly sensible algorithm is to use mbcmp as in luma. * But testing shows that for chroma the DC coefficient is by far the most * important part of the coding cost. Thus a more useful chroma weight is @@ -239,16 +242,16 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f * pixels. * * FIXME: add a (faster) asm sum function to replace sad. */ - cost += abs( h->pixf.sad_aligned[PIXEL_8x8]( buf, 8, flat, 0 ) - - h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) ); + cost += abs( h->pixf.sad_aligned[chromapix]( buf, 8, flat, 0 ) - + h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) ); } cost += x264_weight_slice_header_cost( h, w, 1 ); } else - for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) + for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 8, pixoff += 8 ) - cost += abs( h->pixf.sad_aligned[PIXEL_8x8]( &ref[pixoff], i_stride, flat, 0 ) - - h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) ); + cost += abs( h->pixf.sad_aligned[chromapix]( &ref[pixoff], i_stride, flat, 0 ) - + h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) ); x264_emms(); return cost; } diff --git a/filters/video/depth.c b/filters/video/depth.c index 25dde257..9ea2cbcf 100644 --- a/filters/video/depth.c +++ b/filters/video/depth.c @@ -46,15 +46,17 @@ static int depth_filter_csp_is_supported( int csp ) return csp_mask == X264_CSP_I420 || csp_mask == X264_CSP_I422 || csp_mask == X264_CSP_I444 || - csp_mask == X264_CSP_YV24 || csp_mask == X264_CSP_YV12 || - csp_mask == X264_CSP_NV12; + csp_mask == X264_CSP_YV16 || + csp_mask == X264_CSP_YV24 || + csp_mask == X264_CSP_NV12 || + csp_mask == X264_CSP_NV16; } static int csp_num_interleaved( int csp, int plane ) { int csp_mask = csp & X264_CSP_MASK; - return ( csp_mask == X264_CSP_NV12 && plane == 1 ) ? 2 : 1; + return ( (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ) ? 2 : 1; } /* The dithering algorithm is based on Sierra-2-4A error diffusion. It has been diff --git a/filters/video/resize.c b/filters/video/resize.c index 878a4d77..87687268 100644 --- a/filters/video/resize.c +++ b/filters/video/resize.c @@ -137,6 +137,7 @@ static int convert_csp_to_pix_fmt( int csp ) { case X264_CSP_YV12: /* specially handled via swapping chroma */ case X264_CSP_I420: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV420P16 : PIX_FMT_YUV420P; + case X264_CSP_YV16: /* specially handled via swapping chroma */ case X264_CSP_I422: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV422P16 : PIX_FMT_YUV422P; case X264_CSP_YV24: /* specially handled via swapping chroma */ case X264_CSP_I444: return csp&X264_CSP_HIGH_DEPTH ? 
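csp_num_interleaved above reflects that NV12 and NV16 store U and V interleaved in their second plane; NV16 simply keeps that plane at full height. A sketch of splitting one such UVUV... row into separate U and V rows (buffer names are mine):

    #include <assert.h>
    #include <stdint.h>

    typedef uint8_t pixel;

    /* Split one interleaved row of 'width' chroma samples per channel. */
    static void deinterleave_row( pixel *u, pixel *v, const pixel *uv, int width )
    {
        for( int x = 0; x < width; x++ )
        {
            u[x] = uv[2*x];
            v[x] = uv[2*x+1];
        }
    }

    int main(void)
    {
        pixel uv[8] = { 1,2, 3,4, 5,6, 7,8 };
        pixel u[4], v[4];
        deinterleave_row( u, v, uv, 4 );
        assert( u[0] == 1 && v[0] == 2 && u[3] == 7 && v[3] == 8 );
        return 0;
    }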
PIX_FMT_YUV444P16 : PIX_FMT_YUV444P; @@ -467,11 +468,11 @@ static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x2 h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp ); h->scale = h->dst; - /* swap chroma planes if YV12/YV24 is involved, as libswscale works with I420/I444 */ + /* swap chroma planes if YV12/YV16/YV24 is involved, as libswscale works with I420/I422/I444 */ int src_csp = info->csp & (X264_CSP_MASK | X264_CSP_OTHER); int dst_csp = h->dst_csp & (X264_CSP_MASK | X264_CSP_OTHER); - h->pre_swap_chroma = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV24; - h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV24; + h->pre_swap_chroma = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV16 || src_csp == X264_CSP_YV24; + h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV16 || dst_csp == X264_CSP_YV24; int src_pix_fmt = convert_csp_to_pix_fmt( info->csp ); diff --git a/input/avs.c b/input/avs.c index 59fab8cc..0169746d 100644 --- a/input/avs.c +++ b/input/avs.c @@ -219,15 +219,22 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c } #if !HAVE_SWSCALE /* if swscale is not available, convert the CSP if necessary */ - if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || - (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) ) + if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I422 && !avs_is_yv16( vi )) || + (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) ) { - FAIL_IF_ERROR( avs_version < 2.6f && opt->output_csp == X264_CSP_I444, "avisynth >= 2.6 is required for i444 output\n" ) + FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), + "avisynth >= 2.6 is required for i422/i444 output\n" ) - const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : (opt->output_csp == X264_CSP_I444 ? "YV24" : "RGB"); + const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : + opt->output_csp == X264_CSP_I422 ? "YV16" : + opt->output_csp == X264_CSP_I444 ? 
"YV24" : "RGB"; x264_cli_log( "avs", X264_LOG_WARNING, "converting input clip to %s\n", csp ); - FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && (vi->width&1 || vi->height&1), - "input clip width or height not divisible by 2 (%dx%d)\n", vi->width, vi->height ) + FAIL_IF_ERROR( opt->output_csp < X264_CSP_I444 && (vi->width&1), + "input clip width not divisible by 2 (%dx%d)\n", vi->width, vi->height ) + FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && info->interlaced && (vi->height&3), + "input clip height not divisible by 4 (%dx%d)\n", vi->width, vi->height ) + FAIL_IF_ERROR( (opt->output_csp == X264_CSP_I420 || info->interlaced) && (vi->height&1), + "input clip height not divisible by 2 (%dx%d)\n", vi->width, vi->height ) const char *arg_name[2] = { NULL, "interlaced" }; AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) }; char conv_func[14] = { "ConvertTo" }; @@ -251,13 +258,13 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c info->csp = X264_CSP_BGR | X264_CSP_VFLIP; else if( avs_is_yv24( vi ) ) info->csp = X264_CSP_I444; + else if( avs_is_yv16( vi ) ) + info->csp = X264_CSP_I422; else if( avs_is_yv12( vi ) ) info->csp = X264_CSP_I420; #if HAVE_SWSCALE else if( avs_is_yuy2( vi ) ) info->csp = PIX_FMT_YUYV422 | X264_CSP_OTHER; - else if( avs_is_yv16( vi ) ) - info->csp = X264_CSP_I422; else if( avs_is_yv411( vi ) ) info->csp = PIX_FMT_YUV411P | X264_CSP_OTHER; else if( avs_is_y8( vi ) ) diff --git a/input/input.c b/input/input.c index 084499ae..27c2c3df 100644 --- a/input/input.c +++ b/input/input.c @@ -29,9 +29,11 @@ const x264_cli_csp_t x264_cli_csps[] = { [X264_CSP_I420] = { "i420", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 }, [X264_CSP_I422] = { "i422", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 }, [X264_CSP_I444] = { "i444", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 }, - [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 }, [X264_CSP_YV12] = { "yv12", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 }, + [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 }, + [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 }, [X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, + [X264_CSP_NV16] = { "nv16", 2, { 1, 1 }, { 1, 1 }, 2, 1 }, [X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 }, [X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 }, [X264_CSP_RGB] = { "rgb", 1, { 3 }, { 1 }, 1, 1 }, diff --git a/input/input.h b/input/input.h index 4a4bb0b2..bd7e4218 100644 --- a/input/input.h +++ b/input/input.h @@ -103,8 +103,7 @@ extern cli_input_t timecode_input; extern cli_input_t cli_input; /* extended colorspace list that isn't supported by libx264 but by the cli */ -#define X264_CSP_I422 X264_CSP_MAX /* yuv 4:2:2 planar */ -#define X264_CSP_CLI_MAX (X264_CSP_MAX+1) /* end of list */ +#define X264_CSP_CLI_MAX X264_CSP_MAX /* end of list */ #define X264_CSP_OTHER 0x4000 /* non x264 colorspace */ typedef struct diff --git a/tools/checkasm.c b/tools/checkasm.c index bb1fafc8..0eb1ed54 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -77,11 +77,12 @@ const char *bench_pattern = ""; char func_name[100]; static bench_func_t benchs[MAX_FUNCS]; -static const char *pixel_names[10] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x2", "2x4", "2x2" }; +static const char *pixel_names[12] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x16", "4x2", "2x8", "2x4", "2x2" }; static const char *intra_predict_16x16_names[7] = { "v", "h", "dc", "p", "dcl", "dct", "dc8" }; static const char *intra_predict_8x8c_names[7] = { 
"dc", "h", "v", "p", "dcl", "dct", "dc8" }; static const char *intra_predict_4x4_names[12] = { "v", "h", "dc", "ddl", "ddr", "vr", "hd", "vl", "hu", "dcl", "dct", "dc8" }; static const char **intra_predict_8x8_names = intra_predict_4x4_names; +static const char **intra_predict_8x16c_names = intra_predict_8x8c_names; #define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ ) @@ -274,7 +275,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) #define TEST_PIXEL( name, align ) \ ok = 1, used_asm = 0; \ - for( int i = 0; i < 7; i++ ) \ + for( int i = 0; i < 8; i++ ) \ { \ int res_c, res_asm; \ if( pixel_asm.name[i] != pixel_ref.name[i] ) \ @@ -374,24 +375,28 @@ static int check_pixel( int cpu_ref, int cpu_new ) ok = 1; used_asm = 0; TEST_PIXEL_VAR( PIXEL_16x16 ); + TEST_PIXEL_VAR( PIXEL_8x16 ); TEST_PIXEL_VAR( PIXEL_8x8 ); report( "pixel var :" ); - ok = 1; used_asm = 0; - if( pixel_asm.var2_8x8 != pixel_ref.var2_8x8 ) - { - int res_c, res_asm, ssd_c, ssd_asm; - set_func_name( "var2_8x8" ); - used_asm = 1; - res_c = call_c( pixel_c.var2_8x8, pbuf1, 16, pbuf2, 16, &ssd_c ); - res_asm = call_a( pixel_asm.var2_8x8, pbuf1, 16, pbuf2, 16, &ssd_asm ); - if( res_c != res_asm || ssd_c != ssd_asm ) - { - ok = 0; - fprintf( stderr, "var2_8x8: %d != %d or %d != %d [FAILED]\n", res_c, res_asm, ssd_c, ssd_asm ); - } +#define TEST_PIXEL_VAR2( i ) \ + if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \ + { \ + int res_c, res_asm, ssd_c, ssd_asm; \ + set_func_name( "%s_%s", "var2", pixel_names[i] ); \ + used_asm = 1; \ + res_c = call_c( pixel_c.var2[i], pbuf1, 16, pbuf2, 16, &ssd_c ); \ + res_asm = call_a( pixel_asm.var2[i], pbuf1, 16, pbuf2, 16, &ssd_asm ); \ + if( res_c != res_asm || ssd_c != ssd_asm ) \ + { \ + ok = 0; \ + fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \ + } \ } + ok = 1; used_asm = 0; + TEST_PIXEL_VAR2( PIXEL_8x16 ); + TEST_PIXEL_VAR2( PIXEL_8x8 ); report( "pixel var2 :" ); ok = 1; used_asm = 0; @@ -490,12 +495,14 @@ static int check_pixel( int cpu_ref, int cpu_new ) memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); ok = 1; used_asm = 0; TEST_INTRA_X3( intra_satd_x3_16x16, 0 ); + TEST_INTRA_X3( intra_satd_x3_8x16c, 0 ); TEST_INTRA_X3( intra_satd_x3_8x8c, 0 ); TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge ); TEST_INTRA_X3( intra_satd_x3_4x4, 0 ); report( "intra satd_x3 :" ); ok = 1; used_asm = 0; TEST_INTRA_X3( intra_sad_x3_16x16, 0 ); + TEST_INTRA_X3( intra_sad_x3_8x16c, 0 ); TEST_INTRA_X3( intra_sad_x3_8x8c, 0 ); TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge ); TEST_INTRA_X3( intra_sad_x3_4x4, 0 ); @@ -597,7 +604,7 @@ static int check_dct( int cpu_ref, int cpu_new ) ALIGNED_16( dctcoef dct2[16][16] ); ALIGNED_16( dctcoef dct4[16][16] ); ALIGNED_16( dctcoef dct8[4][64] ); - ALIGNED_16( dctcoef dctdc[2][4] ); + ALIGNED_16( dctcoef dctdc[2][8] ); x264_t h_buf; x264_t *h = &h_buf; @@ -671,6 +678,7 @@ static int check_dct( int cpu_ref, int cpu_new ) TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 ); TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 ); TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 ); + TEST_DCT( sub8x16_dct_dc, dctdc[0], dctdc[1], 8 ); TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 ); report( "sub_dct4 :" ); @@ -757,6 +765,36 @@ static int check_dct( int cpu_ref, int cpu_new ) TEST_DCTDC( idct4x4dc ); #undef TEST_DCTDC +#define TEST_DCTDC_CHROMA( name )\ + ok = 1; used_asm = 0;\ + if( dct_asm.name != dct_ref.name )\ + {\ + set_func_name( #name );\ + used_asm = 1;\ + uint16_t *p = (uint16_t*)buf1;\ + for( int i = 0; i < 16 && ok; 
i++ )\ + {\ + for( int j = 0; j < 8; j++ )\ + dct1[j][0] = !i ? (j^j>>1^j>>2)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\ + : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\ + : ((*p++)&0x1fff)-0x1000; /* general case */\ + memcpy( dct2, dct1, 8*16 * sizeof(dctcoef) );\ + call_c1( dct_c.name, dctdc[0], dct1 );\ + call_a1( dct_asm.name, dctdc[1], dct2 );\ + if( memcmp( dctdc[0], dctdc[1], 8 * sizeof(dctcoef) ) || memcmp( dct1, dct2, 8*16 * sizeof(dctcoef) ) )\ + {\ + ok = 0;\ + fprintf( stderr, #name " [FAILED]\n" ); \ + }\ + }\ + call_c2( dct_c.name, dctdc[0], dct1 );\ + call_a2( dct_asm.name, dctdc[1], dct2 );\ + }\ + report( #name " :" ); + + TEST_DCTDC_CHROMA( dct2x4dc ); +#undef TEST_DCTDC_CHROMA + x264_zigzag_function_t zigzag_c[2]; x264_zigzag_function_t zigzag_ref[2]; x264_zigzag_function_t zigzag_asm[2]; @@ -986,7 +1024,7 @@ static int check_mc( int cpu_ref, int cpu_new ) #define MC_TEST_AVG( name, weight ) \ { \ ok = 1, used_asm = 0; \ - for( int i = 0; i < 10; i++ ) \ + for( int i = 0; i < 12; i++ ) \ { \ memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \ memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \ @@ -1085,34 +1123,49 @@ static int check_mc( int cpu_ref, int cpu_new ) report( "mc offsetsub :" ); ok = 1; used_asm = 0; - if( mc_a.store_interleave_8x8x2 != mc_ref.store_interleave_8x8x2 ) - { - set_func_name( "store_interleave_8x8x2" ); - used_asm = 1; - memset( pbuf3, 0, 64*8 ); - memset( pbuf4, 0, 64*8 ); - call_c( mc_c.store_interleave_8x8x2, pbuf3, 64, pbuf1, pbuf1+16 ); - call_a( mc_a.store_interleave_8x8x2, pbuf4, 64, pbuf1, pbuf1+16 ); - if( memcmp( pbuf3, pbuf4, 64*8 ) ) - ok = 0; - } - if( mc_a.load_deinterleave_8x8x2_fenc != mc_ref.load_deinterleave_8x8x2_fenc ) - { - set_func_name( "load_deinterleave_8x8x2_fenc" ); - used_asm = 1; - call_c( mc_c.load_deinterleave_8x8x2_fenc, pbuf3, pbuf1, 64 ); - call_a( mc_a.load_deinterleave_8x8x2_fenc, pbuf4, pbuf1, 64 ); - if( memcmp( pbuf3, pbuf4, FENC_STRIDE*8 ) ) - ok = 0; - } - if( mc_a.load_deinterleave_8x8x2_fdec != mc_ref.load_deinterleave_8x8x2_fdec ) + for( int height = 8; height <= 16; height += 8 ) { - set_func_name( "load_deinterleave_8x8x2_fdec" ); - used_asm = 1; - call_c( mc_c.load_deinterleave_8x8x2_fdec, pbuf3, pbuf1, 64 ); - call_a( mc_a.load_deinterleave_8x8x2_fdec, pbuf4, pbuf1, 64 ); - if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*8 ) ) - ok = 0; + if( mc_a.store_interleave_chroma != mc_ref.store_interleave_chroma ) + { + set_func_name( "store_interleave_chroma" ); + used_asm = 1; + memset( pbuf3, 0, 64*height ); + memset( pbuf4, 0, 64*height ); + call_c( mc_c.store_interleave_chroma, pbuf3, 64, pbuf1, pbuf1+16, height ); + call_a( mc_a.store_interleave_chroma, pbuf4, 64, pbuf1, pbuf1+16, height ); + if( memcmp( pbuf3, pbuf4, 64*height ) ) + { + ok = 0; + fprintf( stderr, "store_interleave_chroma FAILED: h=%d\n", height ); + break; + } + } + if( mc_a.load_deinterleave_chroma_fenc != mc_ref.load_deinterleave_chroma_fenc ) + { + set_func_name( "load_deinterleave_chroma_fenc" ); + used_asm = 1; + call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, 64, height ); + call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, 64, height ); + if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) ) + { + ok = 0; + fprintf( stderr, "load_deinterleave_chroma_fenc FAILED: h=%d\n", height ); + break; + } + } + if( mc_a.load_deinterleave_chroma_fdec != mc_ref.load_deinterleave_chroma_fdec ) + { + set_func_name( "load_deinterleave_chroma_fdec" ); + used_asm = 1; + call_c( mc_c.load_deinterleave_chroma_fdec, 
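The checkasm hunk above turns the fixed 8x8x2 interleave tests into a height loop, so one harness covers both the 8-line (4:2:0) and 16-line (4:2:2) variants: each function runs through the C and asm paths and the outputs are memcmp'd. A minimal, self-contained version of that compare-two-implementations pattern (both "implementations" here are trivial stand-ins):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    typedef uint8_t pixel;

    static void copy_c( pixel *dst, const pixel *src, int n )   { memcpy( dst, src, n ); }
    static void copy_opt( pixel *dst, const pixel *src, int n ) { for( int i = 0; i < n; i++ ) dst[i] = src[i]; }

    int main(void)
    {
        pixel src[64*16], out_c[64*16], out_a[64*16];
        for( int i = 0; i < 64*16; i++ )
            src[i] = i & 0xff;
        /* Same shape as the patch: exercise both chroma heights. */
        for( int height = 8; height <= 16; height += 8 )
        {
            copy_c  ( out_c, src, 64*height );
            copy_opt( out_a, src, 64*height );
            if( memcmp( out_c, out_a, 64*height ) )
                fprintf( stderr, "FAILED: h=%d\n", height );
        }
        return 0;
    }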
pbuf3, pbuf1, 64, height ); + call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, 64, height ); + if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) ) + { + ok = 0; + fprintf( stderr, "load_deinterleave_chroma_fdec FAILED: h=%d\n", height ); + break; + } + } } report( "store_interleave :" ); @@ -1411,11 +1464,13 @@ static int check_deblock( int cpu_ref, int cpu_new ) TEST_DEBLOCK( deblock_luma[0], 0, tcs[i] ); TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] ); - TEST_DEBLOCK( deblock_chroma[0], 0, tcs[i] ); + TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] ); + TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] ); TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] ); TEST_DEBLOCK( deblock_luma_intra[0], 0 ); TEST_DEBLOCK( deblock_luma_intra[1], 1 ); - TEST_DEBLOCK( deblock_chroma_intra[0], 0 ); + TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 ); + TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 ); TEST_DEBLOCK( deblock_chroma_intra[1], 1 ); if( db_a.deblock_strength != db_ref.deblock_strength ) @@ -1471,6 +1526,8 @@ static int check_quant( int cpu_ref, int cpu_new ) x264_quant_function_t qf_a; ALIGNED_16( dctcoef dct1[64] ); ALIGNED_16( dctcoef dct2[64] ); + ALIGNED_16( dctcoef dct3[8][16] ); + ALIGNED_16( dctcoef dct4[8][16] ); ALIGNED_16( uint8_t cqm_buf[64] ); int ret = 0, ok, used_asm; int oks[3] = {1,1,1}, used_asms[3] = {0,0,0}; @@ -1602,7 +1659,7 @@ static int check_quant( int cpu_ref, int cpu_new ) for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ INIT_QUANT##w(1) \ - call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ @@ -1631,7 +1688,7 @@ static int check_quant( int cpu_ref, int cpu_new ) { \ for( int i = 0; i < 16; i++ ) \ dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \ - call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \ + qf_c.qname( dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \ memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ @@ -1647,27 +1704,75 @@ static int check_quant( int cpu_ref, int cpu_new ) TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 ); -#define TEST_OPTIMIZE_CHROMA_DC( qname, optname, w ) \ + if( qf_a.idct_dequant_2x4_dc != qf_ref.idct_dequant_2x4_dc ) + { + set_func_name( "idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" ); + used_asms[1] = 1; + for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) + { + for( int i = 0; i < 8; i++ ) + dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; + qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); + qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); + call_c( qf_c.idct_dequant_2x4_dc, dct1, dct3, h->dequant4_mf[CQM_4IC], qp+3 ); + call_a( qf_a.idct_dequant_2x4_dc, dct1, dct4, h->dequant4_mf[CQM_4IC], qp+3 ); + for( int i = 0; i < 8; i++ ) + if( dct3[i][0] != dct4[i][0] ) + { + oks[1] = 0; + fprintf( stderr, "idct_dequant_2x4_dc (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm ); + break; + } + } + } + + if( qf_a.idct_dequant_2x4_dconly != qf_ref.idct_dequant_2x4_dconly ) + { + set_func_name( 
"idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" ); + used_asms[1] = 1; + for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) + { + for( int i = 0; i < 8; i++ ) + dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; + qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); + qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); + memcpy( dct2, dct1, 8*sizeof(dctcoef) ); + call_c1( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 ); + call_a1( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 ); + if( memcmp( dct1, dct2, 8*sizeof(dctcoef) ) ) + { + oks[1] = 0; + fprintf( stderr, "idct_dequant_2x4_dconly (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm ); + break; + } + call_c2( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 ); + call_a2( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 ); + } + } + +#define TEST_OPTIMIZE_CHROMA_DC( optname, size ) \ if( qf_a.optname != qf_ref.optname ) \ { \ set_func_name( #optname ); \ used_asms[2] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ - int dmf = h->dequant4_mf[CQM_4IC][qp%6][0] << qp/6; \ + int qpdc = qp + (size == 8 ? 3 : 0); \ + int dmf = h->dequant4_mf[CQM_4IC][qpdc%6][0] << qpdc/6; \ if( dmf > 32*64 ) \ continue; \ - for( int i = 16; ; i <<= 1 )\ + for( int i = 16; ; i <<= 1 ) \ { \ int res_c, res_asm; \ int max = X264_MIN( i, PIXEL_MAX*16 ); \ - for( int j = 0; j < w*w; j++ ) \ + for( int j = 0; j < size; j++ ) \ dct1[j] = rand()%(max*2+1) - max; \ - call_c1( qf_c.qname, dct1, h->quant4_mf[CQM_4IC][qp][0]>>1, h->quant4_bias[CQM_4IC][qp][0]>>1 ); \ - memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ + for( int j = 0; i <= size; j += 4 ) \ + qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \ + memcpy( dct2, dct1, size*sizeof(dctcoef) ); \ res_c = call_c1( qf_c.optname, dct1, dmf ); \ res_asm = call_a1( qf_a.optname, dct2, dmf ); \ - if( res_c != res_asm || memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \ + if( res_c != res_asm || memcmp( dct1, dct2, size*sizeof(dctcoef) ) ) \ { \ oks[2] = 0; \ fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \ @@ -1680,7 +1785,8 @@ static int check_quant( int cpu_ref, int cpu_new ) } \ } - TEST_OPTIMIZE_CHROMA_DC( quant_2x2_dc, optimize_chroma_dc, 2 ); + TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x2_dc, 4 ); + TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x4_dc, 8 ); x264_cqm_delete( h ); } @@ -1751,7 +1857,7 @@ static int check_quant( int cpu_ref, int cpu_new ) TEST_DECIMATE( decimate_score15, 4, 1, 7 ); report( "decimate_score :" ); -#define TEST_LAST( last, lastname, w, ac ) \ +#define TEST_LAST( last, lastname, size, ac ) \ if( qf_a.last != qf_ref.last ) \ { \ set_func_name( #lastname ); \ @@ -1759,8 +1865,8 @@ static int check_quant( int cpu_ref, int cpu_new ) for( int i = 0; i < 100; i++ ) \ { \ int nnz = 0; \ - int max = rand() & (w*w-1); \ - memset( dct1, 0, w*w*sizeof(dctcoef) ); \ + int max = rand() & (size-1); \ + memset( dct1, 0, size*sizeof(dctcoef) ); \ for( int idx = ac; idx < max; idx++ ) \ nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \ if( !nnz ) \ @@ -1777,13 +1883,14 @@ static int check_quant( int cpu_ref, int cpu_new ) } ok = 1; used_asm = 0; - TEST_LAST( coeff_last[DCT_CHROMA_DC], coeff_last4, 2, 0 ); - TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 4, 1 ); - TEST_LAST( coeff_last[ 
@@ -1777,13 +1883,14 @@ static int check_quant( int cpu_ref, int cpu_new )
     }
 
     ok = 1; used_asm = 0;
-    TEST_LAST( coeff_last[DCT_CHROMA_DC], coeff_last4, 2, 0 );
-    TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 4, 1 );
-    TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 4, 0 );
-    TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 8, 0 );
+    TEST_LAST( coeff_last4 , coeff_last4, 4, 0 );
+    TEST_LAST( coeff_last8 , coeff_last8, 8, 0 );
+    TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 16, 1 );
+    TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 16, 0 );
+    TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 64, 0 );
     report( "coeff_last :" );
 
-#define TEST_LEVELRUN( lastname, name, w, ac ) \
+#define TEST_LEVELRUN( lastname, name, size, ac ) \
 if( qf_a.lastname != qf_ref.lastname ) \
 { \
     set_func_name( #name ); \
@@ -1792,8 +1899,8 @@ static int check_quant( int cpu_ref, int cpu_new )
     { \
         x264_run_level_t runlevel_c, runlevel_a; \
         int nnz = 0; \
-        int max = rand() & (w*w-1); \
-        memset( dct1, 0, w*w*sizeof(dctcoef) ); \
+        int max = rand() & (size-1); \
+        memset( dct1, 0, size*sizeof(dctcoef) ); \
         memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
         memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
         for( int idx = ac; idx < max; idx++ ) \
@@ -1814,9 +1921,10 @@ static int check_quant( int cpu_ref, int cpu_new )
     }
 
     ok = 1; used_asm = 0;
-    TEST_LEVELRUN( coeff_level_run[DCT_CHROMA_DC], coeff_level_run4, 2, 0 );
-    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 4, 1 );
-    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 4, 0 );
+    TEST_LEVELRUN( coeff_level_run4 , coeff_level_run4, 4, 0 );
+    TEST_LEVELRUN( coeff_level_run8 , coeff_level_run8, 8, 0 );
+    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 16, 1 );
+    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 16, 0 );
     report( "coeff_level_run :" );
 
     return ret;
@@ -1832,6 +1940,7 @@ static int check_intra( int cpu_ref, int cpu_new )
 {
     x264_predict_t predict_16x16[4+3];
     x264_predict_t predict_8x8c[4+3];
+    x264_predict_t predict_8x16c[4+3];
     x264_predict8x8_t predict_8x8[9+3];
     x264_predict_t predict_4x4[9+3];
     x264_predict_8x8_filter_t predict_8x8_filter;
 
     x264_predict_16x16_init( 0, ip_c.predict_16x16 );
     x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
+    x264_predict_8x16c_init( 0, ip_c.predict_8x16c );
     x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter );
     x264_predict_4x4_init( 0, ip_c.predict_4x4 );
 
     x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
     x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
+    x264_predict_8x16c_init( cpu_ref, ip_ref.predict_8x16c );
     x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter );
     x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );
 
     x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
     x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
+    x264_predict_8x16c_init( cpu_new, ip_a.predict_8x16c );
     x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
     x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
 
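For reference: predict_8x16c covers the 8-wide, 16-tall chroma block of a 4:2:2 macroblock, mirroring the seven 8x8c modes. A sketch of the simplest mode, vertical prediction, under the same pixel/FDEC_STRIDE assumptions as the earlier sketch (illustrative, not x264's exact code):

    #include <string.h>

    /* Vertical prediction for the 8x16 chroma block of 4:2:2: each of the
     * 16 rows is a copy of the row of neighbors directly above the block. */
    static void predict_8x16c_v_sketch( pixel *src )
    {
        for( int y = 0; y < 16; y++ )
            memcpy( &src[y*FDEC_STRIDE], &src[-FDEC_STRIDE], 8*sizeof(pixel) );
    }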
@@ -1856,7 +1968,7 @@ static int check_intra( int cpu_ref, int cpu_new )
 
     ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
 
-#define INTRA_TEST( name, dir, w, align, bench, ... )\
+#define INTRA_TEST( name, dir, w, h, align, bench, ... )\
 if( ip_a.name[dir] != ip_ref.name[dir] )\
 {\
     set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
@@ -1874,7 +1986,7 @@ static int check_intra( int cpu_ref, int cpu_new )
             for( int k = -1; k < 16; k++ )\
                 printf( "%2x ", edge[16+k] );\
             printf( "\n" );\
-            for( int j = 0; j < w; j++ )\
+            for( int j = 0; j < h; j++ )\
            {\
                 printf( "%2x ", edge[14-j] );\
                 for( int k = 0; k < w; k++ )\
@@ -1882,7 +1994,7 @@ static int check_intra( int cpu_ref, int cpu_new )
                 printf( "\n" );\
             }\
             printf( "\n" );\
-            for( int j = 0; j < w; j++ )\
+            for( int j = 0; j < h; j++ )\
             {\
                 printf( " " );\
                 for( int k = 0; k < w; k++ )\
@@ -1895,13 +2007,15 @@ static int check_intra( int cpu_ref, int cpu_new )
 }
 
     for( int i = 0; i < 12; i++ )
-        INTRA_TEST( predict_4x4, i, 4, 4, );
+        INTRA_TEST( predict_4x4, i, 4, 4, 4, );
+    for( int i = 0; i < 7; i++ )
+        INTRA_TEST( predict_8x8c, i, 8, 8, 16, );
     for( int i = 0; i < 7; i++ )
-        INTRA_TEST( predict_8x8c, i, 8, 16, );
+        INTRA_TEST( predict_8x16c, i, 8, 16, 16, );
     for( int i = 0; i < 7; i++ )
-        INTRA_TEST( predict_16x16, i, 16, 16, );
+        INTRA_TEST( predict_16x16, i, 16, 16, 16, );
     for( int i = 0; i < 12; i++ )
-        INTRA_TEST( predict_8x8, i, 8, 8, , edge );
+        INTRA_TEST( predict_8x8, i, 8, 8, 8, , edge );
 
     set_func_name("intra_predict_8x8_filter");
     if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
@@ -1926,31 +2040,33 @@ static int check_intra( int cpu_ref, int cpu_new )
         }
     }
 
-#define EXTREMAL_PLANE(size) \
+#define EXTREMAL_PLANE( w, h ) \
 { \
     int max[7]; \
     for( int j = 0; j < 7; j++ ) \
         max[j] = test ? rand()&PIXEL_MAX : PIXEL_MAX; \
     fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; \
-    for( int j = 0; j < size/2; j++ ) \
+    for( int j = 0; j < w/2; j++ ) \
         fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; \
-    for( int j = size/2; j < size-1; j++ ) \
+    for( int j = w/2; j < w-1; j++ ) \
         fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; \
-    fdec[48+(size-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
-    for( int j = 0; j < size/2; j++ ) \
+    fdec[48+(w-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
+    for( int j = 0; j < h/2; j++ ) \
         fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; \
-    for( int j = size/2; j < size-1; j++ ) \
+    for( int j = h/2; j < h-1; j++ ) \
         fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; \
-    fdec[48+(size-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
+    fdec[48+(h-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
 }
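For reference: EXTREMAL_PLANE drives the seven border regions feeding plane-mode prediction to 0 or PIXEL_MAX in every combination, because the gradient terms H and V are where SIMD implementations typically overflow 16-bit intermediates. For the new 8x16 chroma block, plane prediction follows H.264 8.3.4.4 with ChromaArrayType 2 constants; a hedged sketch with the clip written out, reusing the earlier pixel/FDEC_STRIDE assumptions:

    #define PIXEL_MAX 255   /* assumed: 8-bit build */

    /* Plane prediction for 8x16 chroma (4:2:2): per H.264 8.3.4.4,
     * b = (34*H+32)>>6 and c = (5*V+32)>>6, with the vertical gradient V
     * summed over 8 row pairs instead of 4. */
    static void predict_8x16c_p_sketch( pixel *src )
    {
        int H = 0, V = 0;
        for( int i = 0; i < 4; i++ )
            H += (i+1) * ( src[4+i - FDEC_STRIDE] - src[2-i - FDEC_STRIDE] );
        for( int i = 0; i < 8; i++ )
            V += (i+1) * ( src[-1 + (8+i)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );
        int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
        int b = ( 17*H + 16 ) >> 5;   /* equal to (34*H+32)>>6 */
        int c = (  5*V + 32 ) >> 6;
        for( int y = 0; y < 16; y++ )
            for( int x = 0; x < 8; x++ )
            {
                int v = ( a + b*(x-3) + c*(y-7) + 16 ) >> 5;
                src[x + y*FDEC_STRIDE] = v < 0 ? 0 : v > PIXEL_MAX ? PIXEL_MAX : v;
            }
    }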
     /* Extremal test case for planar prediction. */
     for( int test = 0; test < 100 && ok; test++ )
         for( int i = 0; i < 128 && ok; i++ )
         {
-            EXTREMAL_PLANE( 8 );
-            INTRA_TEST( predict_8x8c, I_PRED_CHROMA_P, 8, 64, 1 );
-            EXTREMAL_PLANE( 16 );
-            INTRA_TEST( predict_16x16, I_PRED_16x16_P, 16, 64, 1 );
+            EXTREMAL_PLANE( 8, 8 );
+            INTRA_TEST( predict_8x8c, I_PRED_CHROMA_P, 8, 8, 64, 1 );
+            EXTREMAL_PLANE( 8, 16 );
+            INTRA_TEST( predict_8x16c, I_PRED_CHROMA_P, 8, 16, 64, 1 );
+            EXTREMAL_PLANE( 16, 16 );
+            INTRA_TEST( predict_16x16, I_PRED_16x16_P, 16, 16, 64, 1 );
         }
     report( "intra pred :" );
     return ret;
diff --git a/x264.c b/x264.c
index 72399569..025bc767 100644
--- a/x264.c
+++ b/x264.c
@@ -121,7 +121,7 @@ static const char * const muxer_names[] =
 
 static const char * const pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 };
 static const char * const log_level_names[] = { "none", "error", "warning", "info", "debug", 0 };
-static const char * const output_csp_names[] = { "i420", "i444", "rgb", 0 };
+static const char * const output_csp_names[] = { "i420", "i422", "i444", "rgb", 0 };
 
 typedef struct
 {
@@ -1131,6 +1131,8 @@ static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info,
     int csp = info->csp & X264_CSP_MASK;
     if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) )
         param->i_csp = X264_CSP_I420;
+    else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_NV16) )
+        param->i_csp = X264_CSP_I422;
     else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) )
         param->i_csp = X264_CSP_I444;
     else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) )
@@ -1355,7 +1357,8 @@ static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
             case OPT_OUTPUT_CSP:
                 FAIL_IF_ERROR( parse_enum_value( optarg, output_csp_names, &output_csp ), "Unknown output csp `%s'\n", optarg )
                 // correct the parsed value to the libx264 csp value
-                output_csp = !output_csp ? X264_CSP_I420 : (output_csp == 1 ? X264_CSP_I444 : X264_CSP_RGB);
+                static const uint8_t output_csp_fix[] = { X264_CSP_I420, X264_CSP_I422, X264_CSP_I444, X264_CSP_RGB };
+                param->i_csp = output_csp = output_csp_fix[output_csp];
                 break;
             default:
 generic_option:
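For reference: both x264.c hunks above rely on the x264.h constants being ordered by chroma format, so an entire format family can be matched with a single range check and the CLI name index can be mapped through a small table instead of a ternary chain. A hedged helper illustrating that invariant; the function name is hypothetical:

    /* Classify a CSP by chroma format using the ordered ranges from x264.h. */
    static int csp_chroma_format_sketch( int csp )
    {
        csp &= X264_CSP_MASK;
        if( csp >= X264_CSP_I420 && csp <= X264_CSP_NV12 ) return 420;
        if( csp >= X264_CSP_I422 && csp <= X264_CSP_NV16 ) return 422;
        if( csp >= X264_CSP_I444 && csp <= X264_CSP_YV24 ) return 444;
        return 0;   /* packed RGB variants or unknown */
    }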
diff --git a/x264.h b/x264.h
index 2cdcfb7c..34ad872c 100644
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
 
 #include "x264_config.h"
 
-#define X264_BUILD 117
+#define X264_BUILD 118
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -181,12 +181,15 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
 #define X264_CSP_I420           0x0001  /* yuv 4:2:0 planar */
 #define X264_CSP_YV12           0x0002  /* yvu 4:2:0 planar */
 #define X264_CSP_NV12           0x0003  /* yuv 4:2:0, with one y plane and one packed u+v */
-#define X264_CSP_I444           0x0004  /* yuv 4:4:4 planar */
-#define X264_CSP_YV24           0x0005  /* yvu 4:4:4 planar */
-#define X264_CSP_BGR            0x0006  /* packed bgr 24bits */
-#define X264_CSP_BGRA           0x0007  /* packed bgr 32bits */
-#define X264_CSP_RGB            0x0008  /* packed rgb 24bits */
-#define X264_CSP_MAX            0x0009  /* end of list */
+#define X264_CSP_I422           0x0004  /* yuv 4:2:2 planar */
+#define X264_CSP_YV16           0x0005  /* yvu 4:2:2 planar */
+#define X264_CSP_NV16           0x0006  /* yuv 4:2:2, with one y plane and one packed u+v */
+#define X264_CSP_I444           0x0007  /* yuv 4:4:4 planar */
+#define X264_CSP_YV24           0x0008  /* yvu 4:4:4 planar */
+#define X264_CSP_BGR            0x0009  /* packed bgr 24bits */
+#define X264_CSP_BGRA           0x000a  /* packed bgr 32bits */
+#define X264_CSP_RGB            0x000b  /* packed rgb 24bits */
+#define X264_CSP_MAX            0x000c  /* end of list */
 #define X264_CSP_VFLIP          0x1000  /* the csp is vertically flipped */
 #define X264_CSP_HIGH_DEPTH     0x2000  /* the csp has a depth of 16 bits per pixel component */
@@ -242,7 +245,7 @@ typedef struct x264_param_t
     /* Video Properties */
     int         i_width;
     int         i_height;
-    int         i_csp;  /* CSP of encoded bitstream, only i420 supported */
+    int         i_csp;  /* CSP of encoded bitstream */
     int         i_level_idc;
     int         i_frame_total; /* number of frames to encode if known, else 0 */
@@ -579,7 +582,7 @@ void    x264_param_apply_fastfirstpass( x264_param_t * );
 /* x264_param_apply_profile:
  *      Applies the restrictions of the given profile.
  *      Currently available profiles are, from most to least restrictive: */
-static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", 0 };
+static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", "high422", "high444", 0 };
 /*      (can be NULL, in which case the function will do nothing)
  *
-- 
2.40.0
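For reference, a minimal sketch of driving the new API surface end to end: select the 4:2:2 colorspace, let x264_param_apply_profile() validate it against "high422", and allocate a matching input picture. The preset and resolution are arbitrary:

    #include "x264.h"

    int main( void )
    {
        x264_param_t param;
        x264_param_default_preset( &param, "medium", NULL );
        param.i_width  = 1280;
        param.i_height = 720;
        param.i_csp    = X264_CSP_I422;   /* 4:2:2 planar input/output */

        /* Returns negative for profiles below High 4:2:2 (e.g. "main"). */
        if( x264_param_apply_profile( &param, "high422" ) < 0 )
            return 1;

        x264_picture_t pic;
        if( x264_picture_alloc( &pic, param.i_csp, param.i_width, param.i_height ) < 0 )
            return 1;
        /* ... open the encoder with x264_encoder_open( &param ) and feed pictures ... */
        x264_picture_clean( &pic );
        return 0;
    }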