From: Loren Merritt <pengvado@videolan.org>
Date: Fri, 3 Jun 2005 05:33:15 +0000 (+0000)
Subject: 8x8 transform and 8x8 intra prediction.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1ab45c8f7411f7b4453ddff66919910e823ed33b;p=libx264

8x8 transform and 8x8 intra prediction.
(backend only, not yet used by mb analysis)


git-svn-id: svn://svn.videolan.org/x264/trunk@246 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/amd64/predict-a.asm b/common/amd64/predict-a.asm
index 979baf26..96e7593a 100644
--- a/common/amd64/predict-a.asm
+++ b/common/amd64/predict-a.asm
@@ -48,17 +48,17 @@ BITS 64
 
 SECTION .text
 
-cglobal predict_8x8_v_mmx
+cglobal predict_8x8c_v_mmx
 cglobal predict_16x16_v_mmx
 
 ;-----------------------------------------------------------------------------
 ;
-; void predict_8x8_v_mmx( uint8_t *src, int i_stride )
+; void predict_8x8c_v_mmx( uint8_t *src, int i_stride )
 ;
 ;-----------------------------------------------------------------------------
 
 ALIGN 16
-predict_8x8_v_mmx :
+predict_8x8c_v_mmx :
     movsxd      rcx, esi        ; i_stride
 
     sub         rdi             , rcx               ; esi <-- line -1
diff --git a/common/cabac.c b/common/cabac.c
index 03f52373..7583c741 100644
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -35,7 +35,7 @@ static int binCount = 0;
 #endif
 
 
-static const int x264_cabac_context_init_I[399][2] =
+static const int x264_cabac_context_init_I[460][2] =
 {
     /* 0 - 10 */
     { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
@@ -181,10 +181,30 @@ static const int x264_cabac_context_init_I[399][2] =
     { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
     { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
     { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
-    { 29, 39 },  { 19, 66 }
+    { 29, 39 },  { 19, 66 },
+
+    /* 399 -> 435 */
+    {  31,  21 }, {  31,  31 }, {  25,  50 },
+    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
+    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
+    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
+    { -23,  68 }, { -24,  50 }, { -11,  74 }, {  23, -13 },
+    {  26, -13 }, {  40, -15 }, {  49, -14 }, {  44,   3 },
+    {  45,   6 }, {  44,  34 }, {  33,  54 }, {  19,  82 },
+    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
+    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
+    {   0,  68 }, {  -9,  92 },
+
+    /* 436 -> 459 */
+    { -14, 106 }, { -13,  97 }, { -15,  90 }, { -12,  90 },
+    { -18,  88 }, { -10,  73 }, {  -9,  79 }, { -14,  86 },
+    { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
+    {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
+    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
 };
 
-static const int x264_cabac_context_init_PB[3][399][2] =
+static const int x264_cabac_context_init_PB[3][460][2] =
 {
     /* i_cabac_init_idc == 0 */
     {
@@ -321,7 +341,25 @@ static const int x264_cabac_context_init_PB[3][399][2] =
         {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
         {  11,  86 },
 
-
+        /* 399 -> 435 */
+        {  12,  40 }, {  11,  51 }, {  14,  59 },
+        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
+        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
+        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
+        { -16,  66 }, { -22,  65 }, { -20,  63 }, {   9,  -2 },
+        {  26,  -9 }, {  33,  -9 }, {  39,  -7 }, {  41,  -2 },
+        {  45,   3 }, {  49,   9 }, {  45,  27 }, {  36,  59 },
+        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
+        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
+        {  -8,  66 }, {  -8,  76 },
+
+        /* 436 -> 459 */
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
+        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
+        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
     },
 
     /* i_cabac_init_idc == 1 */
@@ -459,6 +497,25 @@ static const int x264_cabac_context_init_PB[3][399][2] =
         {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
         {  11,  83 },
 
+        /* 399 -> 435 */
+        {  25,  32 }, {  21,  49 }, {  21,  54 },
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  17, -10 },
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
+        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
+        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
+        {  -4,  67 }, {  -7,  82 },
+
+        /* 436 -> 459 */
+        {  -3,  81 }, {  -3,  76 }, {  -7,  72 }, {  -6,  78 },
+        { -12,  72 }, { -14,  68 }, {  -3,  70 }, {  -6,  76 },
+        {  -5,  66 }, {  -5,  62 }, {   0,  57 }, {  -4,  61 },
+        {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
     },
 
     /* i_cabac_init_idc == 2 */
@@ -595,6 +652,26 @@ static const int x264_cabac_context_init_PB[3][399][2] =
         {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
         {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
         {  25,  61 },
+
+        /* 399 -> 435 */
+        {  21,  33 }, {  19,  50 }, {  17,  61 },
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
+        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
+        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
+        {  -6,  68 }, { -10,  79 },
+
+        /* 436 -> 459 */
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
     }
 };
 
@@ -720,7 +797,7 @@ static const int x264_cabac_entropy[128] =
  *****************************************************************************/
 void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
 {
-    const int (*cabac_context_init)[399][2];
+    const int (*cabac_context_init)[460][2];
     int i;
 
     if( i_slice_type == SLICE_TYPE_I )
@@ -732,7 +809,7 @@ void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int
         cabac_context_init = &x264_cabac_context_init_PB[i_model];
     }
 
-    for( i = 0; i < 399; i++ )
+    for( i = 0; i < 436; i++ )
     {
         int i_pre_state;
 
@@ -865,7 +942,7 @@ void x264_cabac_model_update( x264_cabac_t *cb, int i_slice_type, int i_qp )
 
         i_cost = 0; /* fix8 */
 
-        for( i_ctx = 0; i_ctx < 399; i_ctx++ )
+        for( i_ctx = 0; i_ctx < 436; i_ctx++ )
         {
             int i_weight;
             int i_model_state;
diff --git a/common/cabac.h b/common/cabac.h
index 3051789d..945fb17a 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -34,12 +34,13 @@ typedef struct
     } slice[3];
 
     /* context */
+    /* states 436-459 are for interlacing, so are omitted for now */
     struct
     {
         int i_state;
         int i_mps;
         int i_count;
-    } ctxstate[399];
+    } ctxstate[436];
 
     /* state */
     int i_low;
diff --git a/common/common.c b/common/common.c
index eabb20b2..20b1c2bb 100644
--- a/common/common.c
+++ b/common/common.c
@@ -104,7 +104,7 @@ void    x264_param_default( x264_param_t *param )
     param->i_log_level = X264_LOG_INFO;
 
     /* */
-    param->analyse.intra = X264_ANALYSE_I4x4;
+    param->analyse.intra = X264_ANALYSE_I4x4 | X264_ANALYSE_I8x8;
     param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16;
     param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_TEMPORAL;
     param->analyse.i_me_method = X264_ME_HEX;
diff --git a/common/common.h b/common/common.h
index 2ad1543b..0cd53ae1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -289,6 +289,8 @@ struct x264_t
     {
         DECLARE_ALIGNED( int, luma16x16_dc[16], 16 );
         DECLARE_ALIGNED( int, chroma_dc[2][4], 16 );
+        // FIXME merge with union
+        DECLARE_ALIGNED( int, luma8x8[4][64], 16 );
         union
         {
             DECLARE_ALIGNED( int, residual_ac[15], 16 );
@@ -326,6 +328,8 @@ struct x264_t
 
         /* neighboring MBs */
         unsigned int i_neighbour;
+        unsigned int i_neighbour8[4];       /* neighbours of each 8x8 or 4x4 block that are available */
+        unsigned int i_neighbour4[16];      /* at the time the block is coded */
         int     i_mb_type_top; 
         int     i_mb_type_left; 
         int     i_mb_type_topleft; 
@@ -343,11 +347,13 @@ struct x264_t
         int8_t   *ref[2];                   /* mb ref. set to -1 if non used (intra or Lx only) */
         int16_t (*mvr[2][16])[2];           /* 16x16 mv for each possible ref */
         int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
+        int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
 
         /* current value */
         int     i_type;
         int     i_partition;
         int     i_sub_partition[4];
+        int     b_transform_8x8;
 
         int     i_cbp_luma;
         int     i_cbp_chroma;
@@ -373,7 +379,7 @@ struct x264_t
         /* cache */
         struct
         {
-            /* real intra4x4_pred_mode if I_4X4, I_PRED_4x4_DC if mb available, -1 if not */
+            /* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
             int     intra4x4_pred_mode[X264_SCAN8_SIZE];
 
             /* i_non_zero_count if availble else 0x80 */
@@ -391,6 +397,9 @@ struct x264_t
 
             int16_t direct_mv[2][X264_SCAN8_SIZE][2];
             int8_t  direct_ref[2][X264_SCAN8_SIZE];
+
+            /* left ([0]) and top ([1]) neighbors. 1=>8x8, 0=>4x4 */
+            int8_t  transform_size[2];
         } cache;
 
         /* */
@@ -427,7 +436,7 @@ struct x264_t
             /* ? */
             int i_misc_bits;
             /* MB type counts */
-            int i_mb_count[18];
+            int i_mb_count[19];
             int i_mb_count_p;
             int i_mb_count_skip;
             /* Estimated (SATD) cost as Intra/Predicted frame */
@@ -449,13 +458,14 @@ struct x264_t
         float   f_psnr_mean_u[5];
         float   f_psnr_mean_v[5];
         /* */
-        int64_t i_mb_count[5][18];
+        int64_t i_mb_count[5][19];
 
     } stat;
 
     /* CPU functions dependants */
     x264_predict_t      predict_16x16[4+3];
-    x264_predict_t      predict_8x8[4+3];
+    x264_predict_t      predict_8x8c[4+3];
+    x264_predict8x8_t   predict_8x8[9+3];
     x264_predict_t      predict_4x4[9+3];
 
     x264_pixel_function_t pixf;
diff --git a/common/dct.c b/common/dct.c
index 4cb59722..7c6b2b2a 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -256,6 +256,136 @@ static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
     add8x8_idct( &p_dst[8*i_dst+8], i_dst, &dct[12] );
 }
 
+/****************************************************************************
+ * 8x8 transform:
+ ****************************************************************************/
+
+static inline void dct8_1d( int16_t src[8][8], int16_t dst[8][8] )
+{   /* One 1-D pass of the 8-point forward transform: row i of src is written to column i of dst. */
+    int i;
+    for( i = 0; i < 8; i++ )
+    {
+        const int s07 = src[i][0] + src[i][7];  /* sums of the mirrored sample pairs */
+        const int s16 = src[i][1] + src[i][6];
+        const int s25 = src[i][2] + src[i][5];
+        const int s34 = src[i][3] + src[i][4];
+        /* a0..a3 are built from the pair sums and feed outputs 0, 2, 4, 6 */
+        const int a0 = s07 + s34;
+        const int a1 = s16 + s25;
+        const int a2 = s07 - s34;
+        const int a3 = s16 - s25;
+        /* differences of the same mirrored pairs */
+        const int d07 = src[i][0] - src[i][7];
+        const int d16 = src[i][1] - src[i][6];
+        const int d25 = src[i][2] - src[i][5];
+        const int d34 = src[i][3] - src[i][4];
+        /* a4..a7 are built from the pair differences and feed outputs 1, 3, 5, 7 */
+        const int a4 = d16 + d25 + (d07 + (d07>>1));
+        const int a5 = d07 - d34 - (d25 + (d25>>1));
+        const int a6 = d07 + d34 - (d16 + (d16>>1));
+        const int a7 = d16 - d25 + (d34 + (d34>>1));
+        /* transposed store (dst[k][i]): calling this twice yields the 2-D transform in the original orientation */
+        dst[0][i] =  a0 + a1;
+        dst[1][i] =  a4 + (a7>>2);
+        dst[2][i] =  a2 + (a3>>1);
+        dst[3][i] =  a5 + (a6>>2);
+        dst[4][i] =  a0 - a1;
+        dst[5][i] =  a6 - (a5>>2);
+        dst[6][i] = (a2>>1) - a3;
+        dst[7][i] = (a4>>2) - a7;
+    }
+}
+
+static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{   /* Writes to dct the 8x8 transform of the residual (pix1 - pix2); i_pix1 / i_pix2 are the row strides. */
+    int16_t d[8][8];
+    int16_t tmp[8][8];
+    int y, x;
+    /* build the 8x8 residual, stepping each source by its own stride after every row */
+    for( y = 0; y < 8; y++ )
+    {
+        for( x = 0; x < 8; x++ )
+        {
+            d[y][x] = pix1[x] - pix2[x];
+        }
+        pix1 += i_pix1;
+        pix2 += i_pix2;
+    }
+    /* two 1-D passes; each pass transposes, so the result ends up in the original orientation */
+    dct8_1d( d, tmp );
+    dct8_1d( tmp, dct );
+}
+
+static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{   /* Runs sub8x8_dct8 on the four 8x8 quadrants of a 16x16 block; dct[n] holds quadrant n. */
+    sub8x8_dct8( dct[0],  pix1,             i_pix1,  pix2,             i_pix2 );  /* column 0, row 0 */
+    sub8x8_dct8( dct[1], &pix1[8],          i_pix1, &pix2[8],          i_pix2 );  /* column 8, row 0 */
+    sub8x8_dct8( dct[2], &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 );  /* column 0, row 8 */
+    sub8x8_dct8( dct[3], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 );  /* column 8, row 8 */
+}
+
+static inline void idct8_1d( int16_t src[8][8], int16_t dst[8][8] )
+{   /* One 1-D pass of the 8-point inverse transform: row i of src is written to column i of dst. */
+    int i;
+    for( i = 0; i < 8; i++ )
+    {
+        const int a0 =  src[i][0] + src[i][4];  /* first stage on the even-indexed inputs 0, 2, 4, 6 */
+        const int a2 =  src[i][0] - src[i][4];
+        const int a4 = (src[i][2]>>1) - src[i][6];
+        const int a6 = (src[i][6]>>1) + src[i][2];
+        /* second stage on the even-indexed terms */
+        const int b0 = a0 + a6;
+        const int b2 = a2 + a4;
+        const int b4 = a2 - a4;
+        const int b6 = a0 - a6;
+        /* first stage on the odd-indexed inputs 1, 3, 5, 7 */
+        const int a1 = -src[i][3] + src[i][5] - src[i][7] - (src[i][7]>>1);
+        const int a3 =  src[i][1] + src[i][7] - src[i][3] - (src[i][3]>>1);
+        const int a5 = -src[i][1] + src[i][7] + src[i][5] + (src[i][5]>>1);
+        const int a7 =  src[i][3] + src[i][5] + src[i][1] + (src[i][1]>>1);
+        /* second stage on the odd-indexed terms */
+        const int b1 = (a7>>2) + a1;
+        const int b3 =  a3 + (a5>>2);
+        const int b5 = (a3>>2) - a5;
+        const int b7 =  a7 - (a1>>2);
+        /* final butterflies: outputs k and 7-k are the sum and difference of one even/odd pair; stored transposed */
+        dst[0][i] = b0 + b7;
+        dst[7][i] = b0 - b7;
+        dst[1][i] = b2 + b5;
+        dst[6][i] = b2 - b5;
+        dst[2][i] = b4 + b3;
+        dst[5][i] = b4 - b3;
+        dst[3][i] = b6 + b1;
+        dst[4][i] = b6 - b1;
+    }
+}
+
+static void add8x8_idct8( uint8_t *p_dst, int i_dst, int16_t dct[8][8] )
+{   /* Inverse-transforms dct and adds the result to the 8x8 pixel block at p_dst (row stride i_dst). */
+    int16_t d[8][8];
+    int16_t tmp[8][8];
+    int y, x;
+    /* two 1-D passes; each pass transposes, so d ends up in the original orientation */
+    idct8_1d( dct, tmp );
+    idct8_1d( tmp, d );
+    /* add the rounded, 6-bit down-shifted residual to the existing pixels and clip via clip_uint8 */
+    for( y = 0; y < 8; y++ )
+    {
+        for( x = 0; x < 8; x++ )
+        {
+            p_dst[x] = clip_uint8( p_dst[x] + ((d[y][x] + 32) >> 6) );
+        }
+        p_dst += i_dst;
+    }
+}
+
+static void add16x16_idct8( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] )
+{   /* Runs add8x8_idct8 on the four 8x8 quadrants of a 16x16 block; dct[n] is quadrant n. */
+    add8x8_idct8( &p_dst[0],         i_dst, dct[0] );  /* column 0, row 0 */
+    add8x8_idct8( &p_dst[8],         i_dst, dct[1] );  /* column 8, row 0 */
+    add8x8_idct8( &p_dst[8*i_dst],   i_dst, dct[2] );  /* column 0, row 8 */
+    add8x8_idct8( &p_dst[8*i_dst+8], i_dst, dct[3] );  /* column 8, row 8 */
+}
 
 
 /****************************************************************************
@@ -269,8 +399,14 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     dctf->sub8x8_dct    = sub8x8_dct;
     dctf->add8x8_idct   = add8x8_idct;
 
-    dctf->sub16x16_dct    = sub16x16_dct;
-    dctf->add16x16_idct   = add16x16_idct;
+    dctf->sub16x16_dct  = sub16x16_dct;
+    dctf->add16x16_idct = add16x16_idct;
+
+    dctf->sub8x8_dct8   = sub8x8_dct8;
+    dctf->add8x8_idct8  = add8x8_idct8;
+
+    dctf->sub16x16_dct8  = sub16x16_dct8;
+    dctf->add16x16_idct8 = add16x16_idct8;
 
     dctf->dct4x4dc  = dct4x4dc;
     dctf->idct4x4dc = idct4x4dc;
diff --git a/common/dct.h b/common/dct.h
index bedbbf43..c0493137 100644
--- a/common/dct.h
+++ b/common/dct.h
@@ -35,6 +35,11 @@ typedef struct
     void (*sub16x16_dct)   ( int16_t dct[16][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
     void (*add16x16_idct)  ( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
 
+    void (*sub8x8_dct8)   ( int16_t dct[8][8],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add8x8_idct8)  ( uint8_t *p_dst, int i_dst, int16_t dct[8][8] );
+
+    void (*sub16x16_dct8)   ( int16_t dct[4][8][8],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+    void (*add16x16_idct8)  ( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] );
 
     void (*dct4x4dc) ( int16_t d[4][4] );
     void (*idct4x4dc)( int16_t d[4][4] );
diff --git a/common/frame.c b/common/frame.c
index 92c0ffdd..69a42b8a 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -644,6 +644,18 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
         const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
         int i_edge;
         int i_dir;
+        const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
+
+        /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
+         * entropy coding, but per 64 coeffs for the purpose of deblocking */
+        if( !h->param.b_cabac && b_8x8_transform )
+        {
+            uint32_t *nnz = (uint32_t*)h->mb.non_zero_count[mb_xy];
+            if( nnz[0] ) nnz[0] = 0x01010101;
+            if( nnz[1] ) nnz[1] = 0x01010101;
+            if( nnz[2] ) nnz[2] = 0x01010101;
+            if( nnz[3] ) nnz[3] = 0x01010101;
+        }
 
         /* i_dir == 0 -> vertical edge
          * i_dir == 1 -> horizontal edge */
@@ -719,9 +731,12 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
                 if( i_dir == 0 )
                 {
                     /* vertical edge */
-                    deblocking_filter_edgev( h, &h->fdec->plane[0][16 * mb_y * h->fdec->i_stride[0]+ 16 * mb_x + 4 * i_edge],
-                                                h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1);
-                    if( (i_edge % 2) == 0  )
+                    if( !b_8x8_transform || !(i_edge & 1) )
+                    {
+                        deblocking_filter_edgev( h, &h->fdec->plane[0][16 * mb_y * h->fdec->i_stride[0]+ 16 * mb_x + 4 * i_edge],
+                                                 h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1);
+                    }
+                    if( !(i_edge & 1) )
                     {
                         /* U/V planes */
                         int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
@@ -735,10 +750,13 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
                 else
                 {
                     /* horizontal edge */
-                    deblocking_filter_edgeh( h, &h->fdec->plane[0][(16*mb_y + 4 * i_edge) * h->fdec->i_stride[0]+ 16 * mb_x],
-                                                h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1 );
+                    if( !b_8x8_transform || !(i_edge & 1) )
+                    {
+                        deblocking_filter_edgeh( h, &h->fdec->plane[0][(16*mb_y + 4 * i_edge) * h->fdec->i_stride[0]+ 16 * mb_x],
+                                                 h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1 );
+                    }
                     /* U/V planes */
-                    if( ( i_edge % 2  ) == 0 )
+                    if( !(i_edge & 1) )
                     {
                         int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
                                       i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm
index 3237ebb6..3be28a1c 100644
--- a/common/i386/predict-a.asm
+++ b/common/i386/predict-a.asm
@@ -56,17 +56,17 @@ SECTION .data
 
 SECTION .text
 
-cglobal predict_8x8_v_mmx
+cglobal predict_8x8c_v_mmx
 cglobal predict_16x16_v_mmx
 
 ;-----------------------------------------------------------------------------
 ;
-; void predict_8x8_v_mmx( uint8_t *src, int i_stride )
+; void predict_8x8c_v_mmx( uint8_t *src, int i_stride )
 ;
 ;-----------------------------------------------------------------------------
 
 ALIGN 16
-predict_8x8_v_mmx :
+predict_8x8c_v_mmx :
 
     ;push       edi
     ;push       esi
diff --git a/common/i386/predict.c b/common/i386/predict.c
index cf5c26e9..4b55596f 100644
--- a/common/i386/predict.c
+++ b/common/i386/predict.c
@@ -177,7 +177,7 @@ static void predict_16x16_v( uint8_t *src, int i_stride )
 /****************************************************************************
  * 8x8 prediction for intra chroma block DC, H, V, P
  ****************************************************************************/
-static void predict_8x8_dc_128( uint8_t *src, int i_stride )
+static void predict_8x8c_dc_128( uint8_t *src, int i_stride )
 {
     int y;
 
@@ -191,7 +191,7 @@ static void predict_8x8_dc_128( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
-static void predict_8x8_dc_left( uint8_t *src, int i_stride )
+static void predict_8x8c_dc_left( uint8_t *src, int i_stride )
 {
     int y;
     uint32_t dc0 = 0, dc1 = 0;
@@ -222,7 +222,7 @@ static void predict_8x8_dc_left( uint8_t *src, int i_stride )
     }
 
 }
-static void predict_8x8_dc_top( uint8_t *src, int i_stride )
+static void predict_8x8c_dc_top( uint8_t *src, int i_stride )
 {
     int y, x;
     uint32_t dc0 = 0, dc1 = 0;
@@ -244,7 +244,7 @@ static void predict_8x8_dc_top( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
-static void predict_8x8_dc( uint8_t *src, int i_stride )
+static void predict_8x8c_dc( uint8_t *src, int i_stride )
 {
     int y;
     int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
@@ -291,7 +291,7 @@ static void predict_8x8_dc( uint8_t *src, int i_stride )
     }
 }
 
-static void predict_8x8_h( uint8_t *src, int i_stride )
+static void predict_8x8c_h( uint8_t *src, int i_stride )
 {
     int i;
 
@@ -307,10 +307,10 @@ static void predict_8x8_h( uint8_t *src, int i_stride )
     }
 }
 
-extern void predict_8x8_v_mmx( uint8_t *src, int i_stride );
+extern void predict_8x8c_v_mmx( uint8_t *src, int i_stride );
 
 #if 0
-static void predict_8x8_v( uint8_t *src, int i_stride )
+static void predict_8x8c_v( uint8_t *src, int i_stride )
 {
     int i;
 
@@ -326,7 +326,7 @@ static void predict_8x8_v( uint8_t *src, int i_stride )
 
 
 /****************************************************************************
- * 4x4 prediction for intra luma block DC, H, V, P
+ * 4x4 prediction for intra luma block
  ****************************************************************************/
 static void predict_4x4_dc_128( uint8_t *src, int i_stride )
 {
@@ -422,14 +422,14 @@ void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
     pf[I_PRED_16x16_DC_128 ]= predict_16x16_dc_128;
 }
 
-void x264_predict_8x8_init_mmxext( x264_predict_t pf[7] )
+void x264_predict_8x8c_init_mmxext( x264_predict_t pf[7] )
 {
-    pf[I_PRED_CHROMA_V ]     = predict_8x8_v_mmx;
-    pf[I_PRED_CHROMA_H ]     = predict_8x8_h;
-    pf[I_PRED_CHROMA_DC]     = predict_8x8_dc;
-    pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left;
-    pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8_dc_top;
-    pf[I_PRED_CHROMA_DC_128 ]= predict_8x8_dc_128;
+    pf[I_PRED_CHROMA_V ]     = predict_8x8c_v_mmx;
+    pf[I_PRED_CHROMA_H ]     = predict_8x8c_h;
+    pf[I_PRED_CHROMA_DC]     = predict_8x8c_dc;
+    pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8c_dc_left;
+    pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8c_dc_top;
+    pf[I_PRED_CHROMA_DC_128 ]= predict_8x8c_dc_128;
 }
 
 void x264_predict_4x4_init_mmxext( x264_predict_t pf[12] )
diff --git a/common/i386/predict.h b/common/i386/predict.h
index b00b1e59..9cec1ed2 100644
--- a/common/i386/predict.h
+++ b/common/i386/predict.h
@@ -25,7 +25,7 @@
 #define _I386_PREDICT_H 1
 
 void x264_predict_16x16_init_mmxext ( x264_predict_t pf[7] );
-void x264_predict_8x8_init_mmxext   ( x264_predict_t pf[7] );
+void x264_predict_8x8c_init_mmxext  ( x264_predict_t pf[7] );
 void x264_predict_4x4_init_mmxext   ( x264_predict_t pf[12] );
 
 #endif
diff --git a/common/macroblock.c b/common/macroblock.c
index 95772744..49804e58 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -54,23 +54,71 @@ static const int dequant_mf[6][4][4] =
     { {18, 23, 18, 23}, {23, 29, 23, 29}, {18, 23, 18, 23}, {23, 29, 23, 29} }
 };
 
-#if 0
-static const int i_chroma_qp_table[52] =
+static const int dequant8_mf[6][8][8] =
 {
-     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
-    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
-    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
-    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
-    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
-    39, 39
+  {
+    {20, 19, 25, 19, 20, 19, 25, 19},
+    {19, 18, 24, 18, 19, 18, 24, 18},
+    {25, 24, 32, 24, 25, 24, 32, 24},
+    {19, 18, 24, 18, 19, 18, 24, 18},
+    {20, 19, 25, 19, 20, 19, 25, 19},
+    {19, 18, 24, 18, 19, 18, 24, 18},
+    {25, 24, 32, 24, 25, 24, 32, 24},
+    {19, 18, 24, 18, 19, 18, 24, 18}
+  }, {
+    {22, 21, 28, 21, 22, 21, 28, 21},
+    {21, 19, 26, 19, 21, 19, 26, 19},
+    {28, 26, 35, 26, 28, 26, 35, 26},
+    {21, 19, 26, 19, 21, 19, 26, 19},
+    {22, 21, 28, 21, 22, 21, 28, 21},
+    {21, 19, 26, 19, 21, 19, 26, 19},
+    {28, 26, 35, 26, 28, 26, 35, 26},
+    {21, 19, 26, 19, 21, 19, 26, 19}
+  }, {
+    {26, 24, 33, 24, 26, 24, 33, 24},
+    {24, 23, 31, 23, 24, 23, 31, 23},
+    {33, 31, 42, 31, 33, 31, 42, 31},
+    {24, 23, 31, 23, 24, 23, 31, 23},
+    {26, 24, 33, 24, 26, 24, 33, 24},
+    {24, 23, 31, 23, 24, 23, 31, 23},
+    {33, 31, 42, 31, 33, 31, 42, 31},
+    {24, 23, 31, 23, 24, 23, 31, 23}
+  }, {
+    {28, 26, 35, 26, 28, 26, 35, 26},
+    {26, 25, 33, 25, 26, 25, 33, 25},
+    {35, 33, 45, 33, 35, 33, 45, 33},
+    {26, 25, 33, 25, 26, 25, 33, 25},
+    {28, 26, 35, 26, 28, 26, 35, 26},
+    {26, 25, 33, 25, 26, 25, 33, 25},
+    {35, 33, 45, 33, 35, 33, 45, 33},
+    {26, 25, 33, 25, 26, 25, 33, 25}
+  }, {
+    {32, 30, 40, 30, 32, 30, 40, 30},
+    {30, 28, 38, 28, 30, 28, 38, 28},
+    {40, 38, 51, 38, 40, 38, 51, 38},
+    {30, 28, 38, 28, 30, 28, 38, 28},
+    {32, 30, 40, 30, 32, 30, 40, 30},
+    {30, 28, 38, 28, 30, 28, 38, 28},
+    {40, 38, 51, 38, 40, 38, 51, 38},
+    {30, 28, 38, 28, 30, 28, 38, 28}
+  }, {
+    {36, 34, 46, 34, 36, 34, 46, 34},
+    {34, 32, 43, 32, 34, 32, 43, 32},
+    {46, 43, 58, 43, 46, 43, 58, 43},
+    {34, 32, 43, 32, 34, 32, 43, 32},
+    {36, 34, 46, 34, 36, 34, 46, 34},
+    {34, 32, 43, 32, 34, 32, 43, 32},
+    {46, 43, 58, 43, 46, 43, 58, 43},
+    {34, 32, 43, 32, 34, 32, 43, 32}
+  }
 };
-#endif
 
 int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
 {
     const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
     const int mb = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 8];
-    const int m  = X264_MIN( ma, mb );
+    const int m  = X264_MIN( x264_mb_pred_mode4x4_fix(ma),
+                             x264_mb_pred_mode4x4_fix(mb) );
 
     if( m < 0 )
         return I_PRED_4x4_DC;
@@ -92,6 +140,24 @@ int x264_mb_predict_non_zero_code( x264_t *h, int idx )
     return i_ret & 0x7f;
 }
 
+int x264_mb_transform_8x8_allowed( x264_t *h, int i_mb_type )
+{   /* Returns 1 unless the checks below rule out the 8x8 transform for i_mb_type, in which case 0. */
+    int i;
+    if( i_mb_type == P_8x8 || i_mb_type == B_8x8 )  /* reads h->mb.i_sub_partition[], which must already be set */
+    {
+        for( i = 0; i < 4; i++ )    /* a single disqualifying sub-partition rejects the whole macroblock */
+            if( !IS_SUB8x8(h->mb.i_sub_partition[i])
+                || ( h->mb.i_sub_partition[i] == D_DIRECT_8x8 && !h->sps->b_direct8x8_inference ) )
+            {
+                return 0;
+            }
+    }
+    if( i_mb_type == B_DIRECT && !h->sps->b_direct8x8_inference )  /* same direct8x8_inference condition, whole-MB direct */
+        return 0;
+    /* every other macroblock type is accepted without further checks */
+    return 1;
+}
+
 /****************************************************************************
  * Scan and Quant functions
  ****************************************************************************/
@@ -166,6 +232,44 @@ void x264_mb_dequant_4x4( int16_t dct[4][4], int i_qscale )
     }
 }
 
+void x264_mb_dequant_8x8( int16_t dct[8][8], int i_qscale )
+{   /* Dequantizes dct in place: each coefficient is scaled by dequant8_mf[i_qscale%6] and shifted by a count derived from i_qscale/6. */
+    const int i_mf = i_qscale%6;
+    int y;
+    /* i_qscale >= 12: shift left by (i_qscale/6 - 2); otherwise add a rounding term and shift right */
+    if( i_qscale >= 12 )
+    {
+        const int i_qbits = (i_qscale/6) - 2;
+        for( y = 0; y < 8; y++ )  /* NOTE(review): left-shifting a negative product is formally undefined in ISO C */
+        {
+            dct[y][0] = ( dct[y][0] * dequant8_mf[i_mf][y][0] ) << i_qbits;
+            dct[y][1] = ( dct[y][1] * dequant8_mf[i_mf][y][1] ) << i_qbits;
+            dct[y][2] = ( dct[y][2] * dequant8_mf[i_mf][y][2] ) << i_qbits;
+            dct[y][3] = ( dct[y][3] * dequant8_mf[i_mf][y][3] ) << i_qbits;
+            dct[y][4] = ( dct[y][4] * dequant8_mf[i_mf][y][4] ) << i_qbits;
+            dct[y][5] = ( dct[y][5] * dequant8_mf[i_mf][y][5] ) << i_qbits;
+            dct[y][6] = ( dct[y][6] * dequant8_mf[i_mf][y][6] ) << i_qbits;
+            dct[y][7] = ( dct[y][7] * dequant8_mf[i_mf][y][7] ) << i_qbits;
+        }
+    }
+    else
+    {
+        const int i_qbits = 2 - (i_qscale/6);  /* 1 or 2 when 0 <= i_qscale < 12 (assumes i_qscale is non-negative) */
+        const int i_round = i_qbits; // 1<<(i_qbits-1); the shortcut holds only because i_qbits is 1 or 2 here
+        for( y = 0; y < 8; y++ )
+        {
+            dct[y][0] = ( dct[y][0] * dequant8_mf[i_mf][y][0] + i_round ) >> i_qbits;
+            dct[y][1] = ( dct[y][1] * dequant8_mf[i_mf][y][1] + i_round ) >> i_qbits;
+            dct[y][2] = ( dct[y][2] * dequant8_mf[i_mf][y][2] + i_round ) >> i_qbits;
+            dct[y][3] = ( dct[y][3] * dequant8_mf[i_mf][y][3] + i_round ) >> i_qbits;
+            dct[y][4] = ( dct[y][4] * dequant8_mf[i_mf][y][4] + i_round ) >> i_qbits;
+            dct[y][5] = ( dct[y][5] * dequant8_mf[i_mf][y][5] + i_round ) >> i_qbits;
+            dct[y][6] = ( dct[y][6] * dequant8_mf[i_mf][y][6] + i_round ) >> i_qbits;
+            dct[y][7] = ( dct[y][7] * dequant8_mf[i_mf][y][7] + i_round ) >> i_qbits;
+        }
+    }
+}
+
 void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] )
 {
     const int i8 = x264_scan8[idx];
@@ -829,9 +933,10 @@ void x264_macroblock_cache_init( x264_t *h )
     h->mb.i_b8_stride = h->sps->i_mb_width * 2;
     h->mb.i_b4_stride = h->sps->i_mb_width * 4;
 
-    h->mb.qp  = x264_malloc( i_mb_count * sizeof( int8_t) );
-    h->mb.cbp = x264_malloc( i_mb_count * sizeof( int16_t) );
-    h->mb.skipbp = x264_malloc( i_mb_count * sizeof( int8_t) );
+    h->mb.qp  = x264_malloc( i_mb_count * sizeof(int8_t) );
+    h->mb.cbp = x264_malloc( i_mb_count * sizeof(int16_t) );
+    h->mb.skipbp = x264_malloc( i_mb_count * sizeof(int8_t) );
+    h->mb.mb_transform_size = x264_malloc( i_mb_count * sizeof(int8_t) );
 
     /* 0 -> 3 top(4), 4 -> 6 : left(3) */
     h->mb.intra4x4_pred_mode = x264_malloc( i_mb_count * 7 * sizeof( int8_t ) );
@@ -874,6 +979,7 @@ void x264_macroblock_cache_end( x264_t *h )
     }
     x264_free( h->mb.intra4x4_pred_mode );
     x264_free( h->mb.non_zero_count );
+    x264_free( h->mb.mb_transform_size );
     x264_free( h->mb.skipbp );
     x264_free( h->mb.cbp );
     x264_free( h->mb.qp );
@@ -1070,6 +1176,14 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
     else
         h->mb.i_mb_type_topleft = -1;
 
+    if( h->param.analyse.b_transform_8x8 )
+    {
+        h->mb.cache.transform_size[0] = (h->mb.i_neighbour&MB_LEFT)
+                                      && h->mb.mb_transform_size[i_left_xy];
+        h->mb.cache.transform_size[1] = (h->mb.i_neighbour&MB_TOP)
+                                      && h->mb.mb_transform_size[i_top_xy];
+    }
+
     /* load ref/mv/mvd */
     if( h->sh.i_type != SLICE_TYPE_I )
     {
@@ -1237,12 +1351,37 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             }
         }
     }
+
+    // FIXME skip this if I_4x4 and I_8x8 are disabled?
+    // assumes MB_TOPRIGHT = MB_TOP<<1 (NOTE(review): no shift is used in the statements below -- confirm this note still applies)
+    h->mb.i_neighbour4[0] =
+    h->mb.i_neighbour8[0] = (h->mb.i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT)) /* copies the MB's own top/left/top-left flags */
+                            | ((h->mb.i_neighbour & MB_TOP) ? MB_TOPRIGHT : 0); /* top-right is set whenever MB_TOP is set */
+    h->mb.i_neighbour4[4] =
+    h->mb.i_neighbour4[1] = MB_LEFT | ((h->mb.i_neighbour & MB_TOP) ? (MB_TOP|MB_TOPLEFT|MB_TOPRIGHT) : 0); /* MB_LEFT always set; the three top flags depend only on MB_TOP */
+    h->mb.i_neighbour4[2] =
+    h->mb.i_neighbour4[8] =
+    h->mb.i_neighbour4[10] =
+    h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0); /* top flags always set; the two left flags depend only on MB_LEFT */
+    h->mb.i_neighbour4[3] =
+    h->mb.i_neighbour4[7] =
+    h->mb.i_neighbour4[11] =
+    h->mb.i_neighbour4[13] =
+    h->mb.i_neighbour4[15] =
+    h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT; /* constant: MB_TOPRIGHT is never set for these entries */
+    h->mb.i_neighbour4[5] =
+    h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour & MB_TOPRIGHT) /* the only entries that test the MB's own MB_TOPRIGHT flag */
+                            | ((h->mb.i_neighbour & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
+    h->mb.i_neighbour4[6] =
+    h->mb.i_neighbour4[9] =
+    h->mb.i_neighbour4[12] =
+    h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT; /* constant: all four flags always set */
 }
 
 void x264_macroblock_cache_save( x264_t *h )
 {
     const int i_mb_xy = h->mb.i_mb_xy;
-    const int i_mb_type = h->mb.i_type;
+    const int i_mb_type = x264_mb_type_fix[h->mb.i_type];
     const int s8x8 = h->mb.i_b8_stride;
     const int s4x4 = h->mb.i_b4_stride;
     const int i_mb_4x4 = h->mb.i_b4_xy;
@@ -1295,6 +1434,8 @@ void x264_macroblock_cache_save( x264_t *h )
         }
     }
 
+    h->mb.mb_transform_size[i_mb_xy] = h->mb.b_transform_8x8;
+
     if( !IS_INTRA( i_mb_type ) )
     {
         int i_list;
diff --git a/common/macroblock.h b/common/macroblock.h
index 7f26a2c0..a3b55271 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -36,35 +36,43 @@ enum macroblock_position_e
 
 
 /* XXX mb_type isn't the one written in the bitstream -> only internal usage */
-#define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_16x16 )
+#define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_8x8 || (type) == I_16x16 )
 #define IS_SKIP(type)  ( (type) == P_SKIP || (type) == B_SKIP )
 #define IS_DIRECT(type)  ( (type) == B_DIRECT )
 enum mb_class_e
 {
     I_4x4           = 0,
-    I_16x16         = 1,
-    I_PCM           = 2,
-
-    P_L0            = 3,
-    P_8x8           = 4,
-    P_SKIP          = 5,
-
-    B_DIRECT        = 6,
-    B_L0_L0         = 7,
-    B_L0_L1         = 8,
-    B_L0_BI         = 9,
-    B_L1_L0         = 10,
-    B_L1_L1         = 11,
-    B_L1_BI         = 12,
-    B_BI_L0         = 13,
-    B_BI_L1         = 14,
-    B_BI_BI         = 15,
-    B_8x8           = 16,
-    B_SKIP          = 17,
+    I_8x8           = 1,
+    I_16x16         = 2,
+    I_PCM           = 3,
+
+    P_L0            = 4,
+    P_8x8           = 5,
+    P_SKIP          = 6,
+
+    B_DIRECT        = 7,
+    B_L0_L0         = 8,
+    B_L0_L1         = 9,
+    B_L0_BI         = 10,
+    B_L1_L0         = 11,
+    B_L1_L1         = 12,
+    B_L1_BI         = 13,
+    B_BI_L0         = 14,
+    B_BI_L1         = 15,
+    B_BI_BI         = 16,
+    B_8x8           = 17,
+    B_SKIP          = 18,
+};
+static const int x264_mb_type_fix[19] = /* indexed by an mb_class_e value */
+{
+    I_4x4, I_4x4, I_16x16, I_PCM, /* index 1 (I_8x8) yields I_4x4; every other index yields itself */
+    P_L0, P_8x8, P_SKIP,
+    B_DIRECT, B_L0_L0, B_L0_L1, B_L0_BI, B_L1_L0, B_L1_L1,
+    B_L1_BI, B_BI_L0, B_BI_L1, B_BI_BI, B_8x8, B_SKIP
 };
-static const int x264_mb_type_list0_table[18][2] =
+static const int x264_mb_type_list0_table[19][2] =
 {
-    {0,0}, {0,0}, {0,0},    /* INTRA */
+    {0,0}, {0,0}, {0,0}, {0,0}, /* INTRA */
     {1,1},                  /* P_L0 */
     {0,0},                  /* P_8x8 */
     {1,1},                  /* P_SKIP */
@@ -75,9 +83,9 @@ static const int x264_mb_type_list0_table[18][2] =
     {0,0},                  /* B_8x8 */
     {0,0}                   /* B_SKIP */
 };
-static const int x264_mb_type_list1_table[18][2] =
+static const int x264_mb_type_list1_table[19][2] =
 {
-    {0,0}, {0,0}, {0,0},    /* INTRA */
+    {0,0}, {0,0}, {0,0}, {0,0}, /* INTRA */
     {0,0},                  /* P_L0 */
     {0,0},                  /* P_8x8 */
     {0,0},                  /* P_SKIP */
@@ -160,6 +168,7 @@ void x264_macroblock_bipred_init( x264_t *h );
 void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int i_qscale );
 void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int i_qscale );
 void x264_mb_dequant_4x4( int16_t dct[4][4], int i_qscale );
+void x264_mb_dequant_8x8( int16_t dct[8][8], int i_qscale );
 
 /* x264_mb_predict_mv_16x16:
  *      set mvp with predicted mv for D_16x16 block
@@ -192,8 +201,10 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int mvc[5][2
 
 int  x264_mb_predict_intra4x4_mode( x264_t *h, int idx );
 int  x264_mb_predict_non_zero_code( x264_t *h, int idx );
+int  x264_mb_transform_8x8_allowed( x264_t *h, int i_mb_type );
 
 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale );
+void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale );
 
 void x264_mb_mc( x264_t *h );
 
@@ -244,6 +255,11 @@ static inline void x264_macroblock_cache_skip( x264_t *h, int x, int y, int widt
         }
     }
 }
+static inline void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x, int y, int i_mode )
+{   /* stores i_mode into a 2x2 group of the intra4x4_pred_mode cache */
+    int *cache = &h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+x+8*y]; /* entry for (x,y); consecutive rows are 8 entries apart */
+    cache[0] = cache[1] = cache[8] = cache[9] = i_mode; /* (x,y), (x+1,y), (x,y+1), (x+1,y+1) */
+}
 
 #endif
 
diff --git a/common/predict.c b/common/predict.c
index 13c98603..5ce34e27 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -32,8 +32,8 @@
 #include <stdlib.h>
 #include <stdarg.h>
 
-#include "x264.h"
-#include "predict.h"
+#include "common.h"
+#include "macroblock.h"
 
 #ifdef _MSC_VER
 #undef HAVE_MMXEXT  /* not finished now */
@@ -197,7 +197,7 @@ static void predict_16x16_p( uint8_t *src, int i_stride )
 /****************************************************************************
  * 8x8 prediction for intra chroma block DC, H, V, P
  ****************************************************************************/
-static void predict_8x8_dc_128( uint8_t *src, int i_stride )
+static void predict_8x8c_dc_128( uint8_t *src, int i_stride )
 {
     int x,y;
 
@@ -210,7 +210,7 @@ static void predict_8x8_dc_128( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
-static void predict_8x8_dc_left( uint8_t *src, int i_stride )
+static void predict_8x8c_dc_left( uint8_t *src, int i_stride )
 {
     int x,y;
     int dc0 = 0, dc1 = 0;
@@ -233,7 +233,7 @@ static void predict_8x8_dc_left( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
-static void predict_8x8_dc_top( uint8_t *src, int i_stride )
+static void predict_8x8c_dc_top( uint8_t *src, int i_stride )
 {
     int x,y;
     int dc0 = 0, dc1 = 0;
@@ -256,7 +256,7 @@ static void predict_8x8_dc_top( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
-static void predict_8x8_dc( uint8_t *src, int i_stride )
+static void predict_8x8c_dc( uint8_t *src, int i_stride )
 {
     int x,y;
     int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
@@ -297,7 +297,7 @@ static void predict_8x8_dc( uint8_t *src, int i_stride )
     }
 }
 
-static void predict_8x8_h( uint8_t *src, int i_stride )
+static void predict_8x8c_h( uint8_t *src, int i_stride )
 {
     int i,j;
 
@@ -314,7 +314,7 @@ static void predict_8x8_h( uint8_t *src, int i_stride )
         src += i_stride;
     }
 }
-static void predict_8x8_v( uint8_t *src, int i_stride )
+static void predict_8x8c_v( uint8_t *src, int i_stride )
 {
     int i,j;
 
@@ -327,7 +327,7 @@ static void predict_8x8_v( uint8_t *src, int i_stride )
     }
 }
 
-static void predict_8x8_p( uint8_t *src, int i_stride )
+static void predict_8x8c_p( uint8_t *src, int i_stride )
 {
     int i;
     int x,y;
@@ -362,7 +362,7 @@ static void predict_8x8_p( uint8_t *src, int i_stride )
 }
 
 /****************************************************************************
- * 4x4 prediction for intra luma block DC, H, V, P
+ * 4x4 prediction for intra luma block
  ****************************************************************************/
 static void predict_4x4_dc_128( uint8_t *src, int i_stride )
 {
@@ -638,6 +638,245 @@ static void predict_4x4_hu( uint8_t *src, int i_stride )
     src[3*i_stride+3]= l3;
 }
 
+/****************************************************************************
+ * 8x8 prediction for intra luma block
+ ****************************************************************************/
+
+#define SRC(x,y) src[(x)+(y)*i_stride] /* pixel at column x, row y relative to src; -1 reaches the left column / top row */
+#define PL(y) /* declares l<y>: 1-2-1 filter over left-column rows y-1, y, y+1 */ \
+    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_LEFT /* declares l0..l7 from column -1 */ \
+    const int l0 = ((i_neighbor&MB_TOPLEFT ? SRC(-1,-1) : SRC(-1,0)) /* without MB_TOPLEFT, SRC(-1,0) stands in for the corner */ \
+                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
+    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
+    const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2; /* last row: SRC(-1,7) weighted 3, row 8 is never read */
+
+#define PT(x) /* declares t<x>: 1-2-1 filter over top-row columns x-1, x, x+1 */ \
+    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_TOP /* declares t0..t7 from row -1 */ \
+    const int t0 = ((i_neighbor&MB_TOPLEFT ? SRC(-1,-1) : SRC(0,-1)) /* without MB_TOPLEFT, SRC(0,-1) stands in for the corner */ \
+                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
+    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
+    const int t7 = ((i_neighbor&MB_TOPRIGHT ? SRC(8,-1) : SRC(7,-1)) /* without MB_TOPRIGHT, SRC(7,-1) stands in for SRC(8,-1) */ \
+                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2; \
+
+#define PTR(x) /* assigns (does not declare) t<x>, same filter as PT */ \
+    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_TOPRIGHT /* declares t8..t15 from row -1, columns 8..15 */ \
+    int t8, t9, t10, t11, t12, t13, t14, t15; \
+    if(i_neighbor&MB_TOPRIGHT) { \
+        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
+        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
+    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); /* no top-right: all eight take the unfiltered SRC(7,-1) */
+
+#define PREDICT_8x8_LOAD_TOPLEFT /* declares lt; reads left, corner and top pixels unconditionally */ \
+    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2;
+
+#define PREDICT_8x8_DC(v) /* writes the 32-bit value v twice per row for 8 rows; advances src */ \
+    int y; \
+    for( y = 0; y < 8; y++ ) { \
+        ((uint32_t*)src)[0] = \
+        ((uint32_t*)src)[1] = v; \
+        src += i_stride; \
+    }
+
+static void predict_8x8_dc_128( uint8_t *src, int i_stride, int i_neighbor )
+{   /* fills the 8x8 block with the byte 0x80; i_neighbor is not read */
+    PREDICT_8x8_DC(0x80808080);
+}
+static void predict_8x8_dc_left( uint8_t *src, int i_stride, int i_neighbor )
+{   /* fills the 8x8 block with the mean of the filtered left column l0..l7 */
+    PREDICT_8x8_LOAD_LEFT;
+    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; /* rounded mean of 8 values, replicated into all 4 bytes */
+    PREDICT_8x8_DC(dc);
+}
+static void predict_8x8_dc_top( uint8_t *src, int i_stride, int i_neighbor )
+{   /* fills the 8x8 block with the mean of the filtered top row t0..t7 */
+    PREDICT_8x8_LOAD_TOP;
+    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; /* rounded mean of 8 values, replicated into all 4 bytes */
+    PREDICT_8x8_DC(dc);
+}
+static void predict_8x8_dc( uint8_t *src, int i_stride, int i_neighbor )
+{   /* fills the 8x8 block with the mean of the filtered left column and top row */
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOP;
+    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
+                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; /* rounded mean of 16 values, replicated into all 4 bytes */
+    PREDICT_8x8_DC(dc);
+}
+static void predict_8x8_h( uint8_t *src, int i_stride, int i_neighbor )
+{   /* fills each row y with its filtered left neighbour l<y> */
+    PREDICT_8x8_LOAD_LEFT;
+#define ROW(y) ((uint32_t*)(src+y*i_stride))[0] =\
+               ((uint32_t*)(src+y*i_stride))[1] = 0x01010101U * l##y
+    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); /* two 32-bit stores per row */
+#undef ROW
+}
+static void predict_8x8_v( uint8_t *src, int i_stride, int i_neighbor )
+{   /* writes the filtered top row t0..t7 into row 0, then copies row 0 into rows 1..7 */
+    int y;
+    PREDICT_8x8_LOAD_TOP;
+    src[0] = t0;
+    src[1] = t1;
+    src[2] = t2;
+    src[3] = t3;
+    src[4] = t4;
+    src[5] = t5;
+    src[6] = t6;
+    src[7] = t7;
+    for( y = 1; y < 8; y++ )
+        *(uint64_t*)(src+y*i_stride) = *(uint64_t*)src; /* one 64-bit copy per row; NOTE(review): presumably relies on rows being suitably aligned -- confirm */
+}
+static void predict_8x8_ddl( uint8_t *src, int i_stride, int i_neighbor )
+{   /* pixels with equal x+y share one value: a 1-2-1 filter over t[x+y], t[x+y+1], t[x+y+2] */
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_TOPRIGHT;
+    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
+    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
+    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
+    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
+    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
+    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
+    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
+    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
+    SRC(7,7)= (t14 + 3*t15 + 2) >> 2; /* last diagonal: t15 is weighted 3 because there is no t16 */
+}
+static void predict_8x8_ddr( uint8_t *src, int i_stride, int i_neighbor )
+{   /* pixels with equal x-y share one value: a 1-2-1 filter sliding over l7..l0, lt, t0..t7 */
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
+    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
+    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
+    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
+    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
+    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
+    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
+    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; /* main diagonal, centred on the corner lt */
+    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
+    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
+  
+}
+static void predict_8x8_vr( uint8_t *src, int i_stride, int i_neighbor )
+{   /* pixels with equal 2*x-y share one value, taken from l6..l0, lt, t0..t7 */
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    /* l7 is declared by PREDICT_8x8_LOAD_LEFT but never read below, so compilers may warn that it is unused */
+    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
+    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
+    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
+    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
+    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
+    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
+    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
+    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
+    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
+    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
+    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
+    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
+    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
+    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
+    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(7,0)= (t6 + t7 + 1) >> 1;
+}
+static void predict_8x8_hd( uint8_t *src, int i_stride, int i_neighbor )
+{   /* pixels with equal 2*y-x share one value, taken from l7..l0, lt, t0..t6 */
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOPLEFT;
+    /* t7 is declared by PREDICT_8x8_LOAD_TOP but never read below, so compilers may warn that it is unused */
+    SRC(0,7)= (l6 + l7 + 1) >> 1;
+    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
+    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
+    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
+    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
+    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
+    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
+    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
+    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
+    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
+    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
+    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
+    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
+    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
+    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
+    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
+    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
+}
+static void predict_8x8_vl( uint8_t *src, int i_stride, int i_neighbor )
+{   /* pixels with equal 2*x+y share one value; only t0..t12 are read (t13..t15 are loaded but unused) */
+    PREDICT_8x8_LOAD_TOP;
+    PREDICT_8x8_LOAD_TOPRIGHT;
+    SRC(0,0)= (t0 + t1 + 1) >> 1;
+    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
+    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
+    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
+    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
+    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
+    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
+    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
+    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
+    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
+    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
+    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
+    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
+    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
+    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
+    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
+    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
+    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
+    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
+    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
+    SRC(7,6)= (t10 + t11 + 1) >> 1;
+    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
+}
+static void predict_8x8_hu( uint8_t *src, int i_stride, int i_neighbor )
+{   /* pixels with equal x+2*y share one value, taken from the filtered left column l0..l7 only */
+    PREDICT_8x8_LOAD_LEFT;
+    SRC(0,0)= (l0 + l1 + 1) >> 1;
+    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
+    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
+    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
+    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
+    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
+    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
+    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
+    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
+    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
+    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
+    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
+    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
+    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; /* x+2*y == 13: l7 is weighted 3 because there is no l8 */
+    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
+    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
+    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
+    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; /* every pixel with x+2*y >= 14 is plain l7 */
+}
+
 /****************************************************************************
  * Exported functions:
  ****************************************************************************/
@@ -659,24 +898,40 @@ void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
 #endif
 }
 
-void x264_predict_8x8_init( int cpu, x264_predict_t pf[7] )
+void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
 {
-    pf[I_PRED_CHROMA_V ]     = predict_8x8_v;
-    pf[I_PRED_CHROMA_H ]     = predict_8x8_h;
-    pf[I_PRED_CHROMA_DC]     = predict_8x8_dc;
-    pf[I_PRED_CHROMA_P ]     = predict_8x8_p;
-    pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left;
-    pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8_dc_top;
-    pf[I_PRED_CHROMA_DC_128 ]= predict_8x8_dc_128;
+    pf[I_PRED_CHROMA_V ]     = predict_8x8c_v;
+    pf[I_PRED_CHROMA_H ]     = predict_8x8c_h;
+    pf[I_PRED_CHROMA_DC]     = predict_8x8c_dc;
+    pf[I_PRED_CHROMA_P ]     = predict_8x8c_p;
+    pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8c_dc_left;
+    pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8c_dc_top;
+    pf[I_PRED_CHROMA_DC_128 ]= predict_8x8c_dc_128;
 
 #ifdef HAVE_MMXEXT
     if( cpu&X264_CPU_MMXEXT )
     {
-        x264_predict_8x8_init_mmxext( pf );
+        x264_predict_8x8c_init_mmxext( pf );
     }
 #endif
 }
 
+void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12] )
+{   /* fills pf[] with the C 8x8 luma predictors, indexed by I_PRED_8x8_*; cpu is not read here */
+    pf[I_PRED_8x8_V]      = predict_8x8_v;
+    pf[I_PRED_8x8_H]      = predict_8x8_h;
+    pf[I_PRED_8x8_DC]     = predict_8x8_dc;
+    pf[I_PRED_8x8_DDL]    = predict_8x8_ddl;
+    pf[I_PRED_8x8_DDR]    = predict_8x8_ddr;
+    pf[I_PRED_8x8_VR]     = predict_8x8_vr;
+    pf[I_PRED_8x8_HD]     = predict_8x8_hd;
+    pf[I_PRED_8x8_VL]     = predict_8x8_vl;
+    pf[I_PRED_8x8_HU]     = predict_8x8_hu;
+    pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left;
+    pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top;
+    pf[I_PRED_8x8_DC_128] = predict_8x8_dc_128;
+}
+
 void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
 {
     pf[I_PRED_4x4_V]      = predict_4x4_v;
diff --git a/common/predict.h b/common/predict.h
index 988e57fb..3a9554d7 100644
--- a/common/predict.h
+++ b/common/predict.h
@@ -25,6 +25,7 @@
 #define _PREDICT_H 1
 
 typedef void (*x264_predict_t)( uint8_t *src, int i_stride );
+typedef void (*x264_predict8x8_t)( uint8_t *src, int i_stride, int i_neighbor ); /* i_neighbor: MB_* flags (the C predictors test MB_TOPLEFT and MB_TOPRIGHT) */
 
 enum intra_chroma_pred_e
 {
@@ -37,7 +38,7 @@ enum intra_chroma_pred_e
     I_PRED_CHROMA_DC_TOP  = 5,
     I_PRED_CHROMA_DC_128  = 6
 };
-static const int x264_mb_pred_mode8x8_fix[7] =
+static const int x264_mb_pred_mode8x8c_fix[7] =
 {
     I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P,
     I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC
@@ -76,17 +77,38 @@ enum intra4x4_pred_e
     I_PRED_4x4_DC_TOP  = 10,
     I_PRED_4x4_DC_128  = 11,
 };
-static const int x264_mb_pred_mode4x4_fix[12] =
+static const int x264_mb_pred_mode4x4_fix[13] =
 {
+    -1,
     I_PRED_4x4_V,   I_PRED_4x4_H,   I_PRED_4x4_DC,
     I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR,
     I_PRED_4x4_HD,  I_PRED_4x4_VL,  I_PRED_4x4_HU,
     I_PRED_4x4_DC,  I_PRED_4x4_DC,  I_PRED_4x4_DC
 };
+#define x264_mb_pred_mode4x4_fix(t) x264_mb_pred_mode4x4_fix[(t)+1] /* the +1 skips the leading -1 entry, so t == -1 yields -1 */
+
+/* must use the same numbering as intra4x4_pred_e */
+enum intra8x8_pred_e
+{
+    I_PRED_8x8_V  = 0,
+    I_PRED_8x8_H  = 1,
+    I_PRED_8x8_DC = 2,
+    I_PRED_8x8_DDL= 3,
+    I_PRED_8x8_DDR= 4,
+    I_PRED_8x8_VR = 5,
+    I_PRED_8x8_HD = 6,
+    I_PRED_8x8_VL = 7,
+    I_PRED_8x8_HU = 8,
+
+    I_PRED_8x8_DC_LEFT = 9,  /* installed as predict_8x8_dc_left: mean of the left column only */
+    I_PRED_8x8_DC_TOP  = 10, /* installed as predict_8x8_dc_top: mean of the top row only */
+    I_PRED_8x8_DC_128  = 11, /* installed as predict_8x8_dc_128: constant 0x80 fill */
+};
 
 void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );
-void x264_predict_8x8_init   ( int cpu, x264_predict_t pf[7] );
+void x264_predict_8x8c_init  ( int cpu, x264_predict_t pf[7] );
 void x264_predict_4x4_init   ( int cpu, x264_predict_t pf[12] );
+void x264_predict_8x8_init   ( int cpu, x264_predict8x8_t pf[12] );
 
 
 #endif
diff --git a/common/set.h b/common/set.h
index d559902f..3893bf64 100644
--- a/common/set.h
+++ b/common/set.h
@@ -28,7 +28,11 @@ enum profile_e
 {
     PROFILE_BASELINE = 66,
     PROFILE_MAIN = 77,
-    PROFILE_EXTENTED = 88
+    PROFILE_EXTENTED = 88,
+    PROFILE_HIGH = 100,
+    PROFILE_HIGH10 = 110,
+    PROFILE_HIGH422 = 122,
+    PROFILE_HIGH444 = 144
 };
 
 typedef struct
@@ -133,6 +137,9 @@ typedef struct
     int b_deblocking_filter_control;
     int b_constrained_intra_pred;
     int b_redundant_pic_cnt;
+
+    int b_transform_8x8_mode;
+
 } x264_pps_t;
 
 #endif
diff --git a/common/visualize.c b/common/visualize.c
index 78f2d91f..053c1b1b 100644
--- a/common/visualize.c
+++ b/common/visualize.c
@@ -147,6 +147,7 @@ void x264_visualize_show( x264_t *h )
     static const stringlist_t mb_types[] = {
         /* Block types marked as NULL will not be drawn */
         { I_4x4   , "red" },
+        { I_8x8   , "#ff5640" },
         { I_16x16 , "#ff8060" },
         { I_PCM   , "violet" },
         { P_L0    , "SlateBlue" },
@@ -256,7 +257,7 @@ void x264_visualize_show( x264_t *h )
             }
         }
 
-        if (v->i_type==I_4x4 || v->i_type==I_16x16 || v->i_type==I_PCM) {
+        if (IS_INTRA(v->i_type) || v->i_type==I_PCM) {
             /* Intra coded */
             if (v->i_type==I_16x16) {
                 switch (v->i_intra16x16_pred_mode) {
@@ -278,42 +279,44 @@ void x264_visualize_show( x264_t *h )
                     break;
                 }
             }
-            if (v->i_type==I_4x4) {
-                for (i=0; i<4; i++) for (j=0; j<4; j++) {
+            if (v->i_type==I_4x4 || v->i_type==I_8x8) {
+                const int di = v->i_type==I_8x8 ? 2 : 1;
+                const int zoom2 = zoom * di;
+                for (i=0; i<4; i+=di) for (j=0; j<4; j+=di) {
                     const int x0 = x + j*4*zoom;
                     const int y0 = y + i*4*zoom;
-                    if (drawbox) disp_rect(0, x0, y0, x0+4*zoom, y0+4*zoom);
+                    if (drawbox) disp_rect(0, x0, y0, x0+4*zoom2, y0+4*zoom2);
                     switch (v->intra4x4_pred_mode[i][j]) {
                     case I_PRED_4x4_V:		/* Vertical */
-                        disp_line(0, x0+0*zoom, y0+1*zoom, x0+4*zoom, y0+1*zoom);
+                        disp_line(0, x0+0*zoom2, y0+1*zoom2, x0+4*zoom2, y0+1*zoom2);
                         break;
                     case I_PRED_4x4_H:		/* Horizontal */
-                        disp_line(0, x0+1*zoom, y0+0*zoom, x0+1*zoom, y0+4*zoom);
+                        disp_line(0, x0+1*zoom2, y0+0*zoom2, x0+1*zoom2, y0+4*zoom2);
                         break;
                     case I_PRED_4x4_DC:		/* DC, average from top and left sides */
                     case I_PRED_4x4_DC_LEFT:
                     case I_PRED_4x4_DC_TOP:
                     case I_PRED_4x4_DC_128:
-                        disp_line(0, x0+1*zoom, y0+1*zoom, x0+4*zoom, y0+1*zoom);
-                        disp_line(0, x0+1*zoom, y0+1*zoom, x0+1*zoom, y0+4*zoom);
+                        disp_line(0, x0+1*zoom2, y0+1*zoom2, x0+4*zoom2, y0+1*zoom2);
+                        disp_line(0, x0+1*zoom2, y0+1*zoom2, x0+1*zoom2, y0+4*zoom2);
                         break;
                     case I_PRED_4x4_DDL:	/* Topright-bottomleft */
-                        disp_line(0, x0+0*zoom, y0+0*zoom, x0+4*zoom, y0+4*zoom);
+                        disp_line(0, x0+0*zoom2, y0+0*zoom2, x0+4*zoom2, y0+4*zoom2);
                         break;
                     case I_PRED_4x4_DDR:	/* Topleft-bottomright */
-                        disp_line(0, x0+0*zoom, y0+4*zoom, x0+4*zoom, y0+0*zoom);
+                        disp_line(0, x0+0*zoom2, y0+4*zoom2, x0+4*zoom2, y0+0*zoom2);
                         break;
                     case I_PRED_4x4_VR:		/* Mix of topleft-bottomright and vertical */
-                        disp_line(0, x0+0*zoom, y0+2*zoom, x0+4*zoom, y0+1*zoom);
+                        disp_line(0, x0+0*zoom2, y0+2*zoom2, x0+4*zoom2, y0+1*zoom2);
                         break;
                     case I_PRED_4x4_HD:		/* Mix of topleft-bottomright and horizontal */
-                        disp_line(0, x0+2*zoom, y0+0*zoom, x0+1*zoom, y0+4*zoom);
+                        disp_line(0, x0+2*zoom2, y0+0*zoom2, x0+1*zoom2, y0+4*zoom2);
                         break;
                     case I_PRED_4x4_VL:		/* Mix of topright-bottomleft and vertical */
-                        disp_line(0, x0+0*zoom, y0+1*zoom, x0+4*zoom, y0+2*zoom);
+                        disp_line(0, x0+0*zoom2, y0+1*zoom2, x0+4*zoom2, y0+2*zoom2);
                         break;
                     case I_PRED_4x4_HU:		/* Mix of topright-bottomleft and horizontal */
-                        disp_line(0, x0+1*zoom, y0+0*zoom, x0+2*zoom, y0+4*zoom);
+                        disp_line(0, x0+1*zoom2, y0+0*zoom2, x0+2*zoom2, y0+4*zoom2);
                         break;
                     }
                 }
diff --git a/encoder/analyse.c b/encoder/analyse.c
index d9a58d1d..99238651 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -77,16 +77,19 @@ typedef struct
     /* Take some shortcuts in intra search if intra is deemed unlikely */
     int b_fast_intra;
 
-    /* Luma part 16x16 and 4x4 modes stats */
+    /* Luma part */
     int i_sad_i16x16;
     int i_predict16x16;
 
+    int i_sad_i8x8;
+    int i_predict8x8[2][2];
+
     int i_sad_i4x4;
     int i_predict4x4[4][4];
 
     /* Chroma part */
-    int i_sad_i8x8;
-    int i_predict8x8;
+    int i_sad_i8x8chroma;
+    int i_predict8x8chroma;
 
     /* II: Inter part P/B frame */
     x264_mb_analysis_list_t l0;
@@ -126,8 +129,8 @@ static const uint8_t block_idx_y[16] = {
 };
 
 /* TODO: calculate CABAC costs */
-static const int i_mb_b_cost_table[18] = {
-    9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
+static const int i_mb_b_cost_table[19] = {
+    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 };
 static const int i_mb_b16x8_cost_table[16] = {
     0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
@@ -175,10 +178,13 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
                         && h->mb.i_subpel_refine >= 5;
     a->b_fast_intra = 0;
 
+    h->mb.b_transform_8x8 = 0;
+
     /* I: Intra part */
     a->i_sad_i16x16 =
+    a->i_sad_i8x8   =
     a->i_sad_i4x4   =
-    a->i_sad_i8x8   = COST_MAX;
+    a->i_sad_i8x8chroma = COST_MAX;
 
     /* II: Inter part P/B frame */
     if( h->sh.i_type != SLICE_TYPE_I )
@@ -244,7 +250,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
                || IS_INTRA( h->mb.i_mb_type_topleft )
                || IS_INTRA( h->mb.i_mb_type_topright )
                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
-               || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_16x16])) )
+               || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
             { /* intra is likely */ }
             else
             {
@@ -294,7 +300,7 @@ static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, i
 }
 
 /* Max = 4 */
-static void predict_8x8_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
+static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
 {
     if( i_neighbour & MB_TOPLEFT )
     {
@@ -327,30 +333,18 @@ static void predict_8x8_mode_available( unsigned int i_neighbour, int *mode, int
     }
 }
 
-/* MAX = 8 */
-static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int *mode, int *pi_count )
+/* MAX = 9 */
+static void predict_4x4_mode_available( unsigned int i_neighbour,
+                                        int *mode, int *pi_count )
 {
-    int b_a, b_b, b_c;
-    static const unsigned int needmb[16] =
-    {
-        MB_LEFT|MB_TOP, MB_TOP,
-        MB_LEFT,        MB_PRIVATE,
-        MB_TOP,         MB_TOP|MB_TOPRIGHT,
-        0,              MB_PRIVATE,
-        MB_LEFT,        0,
-        MB_LEFT,        MB_PRIVATE,
-        0,              MB_PRIVATE,
-        0,              MB_PRIVATE
-    };
-
-    /* FIXME even when b_c == 0 there is some case where missing pixels
+    /* FIXME even when b_tr == 0 there is some case where missing pixels
      * are emulated and thus more mode are available TODO
      * analysis and encode should be fixed too */
-    b_a = (needmb[idx]&i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT);
-    b_b = (needmb[idx]&i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);
-    b_c = (needmb[idx]&i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE));
+    int b_l = i_neighbour & MB_LEFT;
+    int b_t = i_neighbour & MB_TOP;
+    int b_tr = i_neighbour & MB_TOPRIGHT;
 
-    if( b_a && b_b )
+    if( b_l && b_t )
     {
         *mode++ = I_PRED_4x4_DC;
         *mode++ = I_PRED_4x4_H;
@@ -359,24 +353,16 @@ static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int *
         *mode++ = I_PRED_4x4_VR;
         *mode++ = I_PRED_4x4_HD;
         *mode++ = I_PRED_4x4_HU;
-
         *pi_count = 7;
-
-        if( b_c )
-        {
-            *mode++ = I_PRED_4x4_DDL;
-            *mode++ = I_PRED_4x4_VL;
-            (*pi_count) += 2;
-        }
     }
-    else if( b_a && !b_b )
+    else if( b_l )
     {
         *mode++ = I_PRED_4x4_DC_LEFT;
         *mode++ = I_PRED_4x4_H;
         *mode++ = I_PRED_4x4_HU;
         *pi_count = 3;
     }
-    else if( !b_a && b_b )
+    else if( b_t )
     {
         *mode++ = I_PRED_4x4_DC_TOP;
         *mode++ = I_PRED_4x4_V;
@@ -387,6 +373,13 @@ static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int *
         *mode++ = I_PRED_4x4_DC_128;
         *pi_count = 1;
     }
+
+    if( b_t && b_tr )
+    {
+        *mode++ = I_PRED_4x4_DDL;
+        *mode++ = I_PRED_4x4_VL;
+        (*pi_count) += 2;
+    }
 }
 
 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cost_inter )
@@ -455,7 +448,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cos
             p_dst_by = p_dst + 4 * x + 4 * y * i_stride;
 
             i_best = COST_MAX;
-            predict_4x4_mode_available( h->mb.i_neighbour, idx, predict_mode, &i_max );
+            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
             for( i = 0; i < i_max; i++ )
             {
                 int i_sad;
@@ -481,18 +474,69 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cos
             }
             res->i_sad_i4x4 += i_best;
 
-            /* we need to encode this mb now (for next ones) */
+            /* we need to encode this block now (for next ones) */
             h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_stride );
             x264_mb_encode_i4x4( h, idx, res->i_qp );
 
-            /* we need to store the 'fixed' version */
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] =
-                x264_mb_pred_mode4x4_fix[res->i_predict4x4[x][y]];
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = res->i_predict4x4[x][y];
         }
         res->i_sad_i4x4 += res->i_lambda * 24;    /* from JVT (SATD0) */
         if( h->sh.i_type == SLICE_TYPE_B )
             res->i_sad_i4x4 += res->i_lambda * i_mb_b_cost_table[I_4x4];
     }
+
+    /* 8x8 prediction selection: for each of the four 8x8 luma blocks keep the mode with the lowest SATD + lambda-weighted mode cost */
+    if( flags & X264_ANALYSE_I8x8 )
+    {
+        res->i_sad_i8x8 = 0;
+        for( idx = 0; idx < 4; idx++ )
+        {
+            uint8_t *p_src_by;
+            uint8_t *p_dst_by;
+            int     i_best;
+            int x, y;
+            int i_pred_mode;
+
+            i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
+            x = idx&1;
+            y = idx>>1;
+
+            p_src_by = p_src + 8 * x + 8 * y * i_stride;
+            p_dst_by = p_dst + 8 * x + 8 * y * i_stride;
+
+            i_best = COST_MAX;
+            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+            for( i = 0; i < i_max; i++ )
+            {
+                int i_sad;
+                int i_mode;
+
+                i_mode = predict_mode[i];
+                h->predict_8x8[i_mode]( p_dst_by, i_stride, h->mb.i_neighbour );
+
+                i_sad = h->pixf.satd[PIXEL_8x8]( p_dst_by, i_stride,
+                                                 p_src_by, i_stride );
+                /* mode cost: weight 1 when the mode equals the predicted one, 4 otherwise */
+                i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4);
+
+                if( i_best > i_sad )
+                {
+                    res->i_predict8x8[x][y] = i_mode;
+                    i_best = i_sad;
+                }
+            }
+            res->i_sad_i8x8 += i_best;
+
+            /* we need to encode this block now (for next ones) */
+            h->predict_8x8[res->i_predict8x8[x][y]]( p_dst_by, i_stride, h->mb.i_neighbour );
+            x264_mb_encode_i8x8( h, idx, res->i_qp );
+            /* cache the 8x8 mode chosen above (not the 4x4 result); presumably read back by x264_mb_predict_intra4x4_mode for later blocks */
+            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, res->i_predict8x8[x][y] );
+        }
+//      res->i_sad_i8x8 += res->i_lambda * something;    // FIXME
+        if( h->sh.i_type == SLICE_TYPE_B )
+            res->i_sad_i8x8 += res->i_lambda * i_mb_b_cost_table[I_8x8];
+    }
 }
 
 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
@@ -505,7 +549,7 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
     uint8_t *p_dstc[2], *p_srcc[2];
     int      i_stride[2];
 
-    if( res->i_sad_i8x8 < COST_MAX )
+    if( res->i_sad_i8x8chroma < COST_MAX )
         return;
 
     /* 8x8 prediction selection for chroma */
@@ -517,8 +561,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
     i_stride[0] = h->mb.pic.i_stride[1];
     i_stride[1] = h->mb.pic.i_stride[2];
 
-    predict_8x8_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
-    res->i_sad_i8x8 = COST_MAX;
+    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
+    res->i_sad_i8x8chroma = COST_MAX;
     for( i = 0; i < i_max; i++ )
     {
         int i_sad;
@@ -527,23 +571,25 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
         i_mode = predict_mode[i];
 
         /* we do the prediction */
-        h->predict_8x8[i_mode]( p_dstc[0], i_stride[0] );
-        h->predict_8x8[i_mode]( p_dstc[1], i_stride[1] );
+        h->predict_8x8c[i_mode]( p_dstc[0], i_stride[0] );
+        h->predict_8x8c[i_mode]( p_dstc[1], i_stride[1] );
 
         /* we calculate the cost */
         i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0],
                                          p_srcc[0], i_stride[0] ) +
                 h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1],
                                          p_srcc[1], i_stride[1] ) +
-                res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8_fix[i_mode] );
+                res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 
         /* if i_score is lower it is better */
-        if( res->i_sad_i8x8 > i_sad )
+        if( res->i_sad_i8x8chroma > i_sad )
         {
-            res->i_predict8x8 = i_mode;
-            res->i_sad_i8x8     = i_sad;
+            res->i_predict8x8chroma = i_mode;
+            res->i_sad_i8x8chroma   = i_sad;
         }
     }
+
+    h->mb.i_chroma_pred_mode = res->i_predict8x8chroma;
 }
 
 #define LOAD_FENC( m, src, xoff, yoff) \
@@ -1316,12 +1362,18 @@ void x264_macroblock_analyse( x264_t *h )
     /*--------------------------- Do the analysis ---------------------------*/
     if( h->sh.i_type == SLICE_TYPE_I )
     {
+        int i_cost;
         x264_mb_analyse_intra( h, &analysis, COST_MAX );
 
-        if( analysis.i_sad_i4x4 < analysis.i_sad_i16x16 )
+        i_cost = analysis.i_sad_i16x16;
+        h->mb.i_type = I_16x16;
+        if( analysis.i_sad_i4x4 < i_cost )
+        {
+            i_cost = analysis.i_sad_i4x4;
             h->mb.i_type = I_4x4;
-        else
-            h->mb.i_type = I_16x16;
+        }
+        if( analysis.i_sad_i8x8 < i_cost )
+            h->mb.i_type = I_8x8;
     }
     else if( h->sh.i_type == SLICE_TYPE_P )
     {
@@ -1493,8 +1545,8 @@ void x264_macroblock_analyse( x264_t *h )
              || ( analysis.i_sad_i4x4 < i_cost )))
             {
                 x264_mb_analyse_intra_chroma( h, &analysis );
-                analysis.i_sad_i16x16 += analysis.i_sad_i8x8;
-                analysis.i_sad_i4x4 += analysis.i_sad_i8x8;
+                analysis.i_sad_i16x16 += analysis.i_sad_i8x8chroma;
+                analysis.i_sad_i4x4 += analysis.i_sad_i8x8chroma;
             }
 
             i_intra_type = I_16x16;
@@ -1697,7 +1749,7 @@ void x264_macroblock_analyse( x264_t *h )
     }
 
     /*-------------------- Update MB from the analysis ----------------------*/
-    h->mb.type[h->mb.i_mb_xy] = h->mb.i_type;
+    h->mb.type[h->mb.i_mb_xy] = x264_mb_type_fix[h->mb.i_type];
     switch( h->mb.i_type )
     {
         case I_4x4:
@@ -1708,13 +1760,18 @@ void x264_macroblock_analyse( x264_t *h )
             }
 
             x264_mb_analyse_intra_chroma( h, &analysis );
-            h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
+            break;
+        case I_8x8:
+            h->mb.b_transform_8x8 = 1;
+            for( i = 0; i < 4; i++ )
+                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1),
+                    analysis.i_predict8x8[i&1][i>>1] );
+
+            x264_mb_analyse_intra_chroma( h, &analysis );
             break;
         case I_16x16:
             h->mb.i_intra16x16_pred_mode = analysis.i_predict16x16;
-
             x264_mb_analyse_intra_chroma( h, &analysis );
-            h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
             break;
 
         case P_L0:
diff --git a/encoder/cabac.c b/encoder/cabac.c
index c2cae3dd..257191f0 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -47,7 +47,7 @@ static const uint8_t block_idx_xy[4][4] =
 static inline void x264_cabac_mb_type_intra( x264_t *h, int i_mb_type,
                     int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 )
 {
-    if( i_mb_type == I_4x4 )
+    if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
     {
         x264_cabac_encode_decision( &h->cabac, ctx0, 0 );
     }
@@ -78,7 +78,7 @@ static inline void x264_cabac_mb_type_intra( x264_t *h, int i_mb_type,
 
 static void x264_cabac_mb_type( x264_t *h )
 {
-    const int i_mb_type = h->mb.i_type;
+    const int i_mb_type = x264_mb_type_fix[h->mb.i_type];
 
     if( h->sh.i_type == SLICE_TYPE_I )
     {
@@ -268,7 +268,8 @@ static void x264_cabac_mb_intra4x4_pred_mode( x264_t *h, int i_pred, int i_mode
         x264_cabac_encode_decision( &h->cabac, 69, (i_mode >> 2)&0x01 );
     }
 }
-static void x264_cabac_mb_intra8x8_pred_mode( x264_t *h )
+
+static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h )
 {
     const int i_mode  = h->mb.i_chroma_pred_mode;
     int       ctx = 0;
@@ -554,6 +555,13 @@ static inline void x264_cabac_mb_sub_b_partition( x264_t *h, int i_sub )
     }
 }
 
+static inline void x264_cabac_mb_transform_size( x264_t *h )
+{
+    int ctx = ( h->mb.cache.transform_size[0] == 1 )
+            + ( h->mb.cache.transform_size[1] == 1 );
+    x264_cabac_encode_decision( &h->cabac, 399 + ctx, h->mb.b_transform_8x8 );
+}
+
 static inline void x264_cabac_mb_ref( x264_t *h, int i_list, int idx )
 {
     const int i8 = x264_scan8[idx];
@@ -818,12 +826,24 @@ static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
 
 static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx, int *l, int i_count )
 {
-    static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
-    static const int last_significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
-    static const int coeff_abs_level_m1_offset[5] = { 0, 10, 20, 30, 39 };
-
-    int i_coeff_abs_m1[16];
-    int i_coeff_sign[16];
+    static const int significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 297 };
+    static const int last_significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 251 };
+    static const int coeff_abs_level_m1_offset[6] = { 0, 10, 20, 30, 39, 199 };
+    static const int significant_coeff_flag_offset_8x8[63] = {
+        0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
+        4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
+        7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
+       12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12
+    };
+    static const int last_significant_coeff_flag_offset_8x8[63] = {
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+        5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
+    };
+
+    int i_coeff_abs_m1[64];
+    int i_coeff_sign[64];
     int i_coeff = 0;
     int i_last  = 0;
 
@@ -837,46 +857,50 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx,
      *                2-> Luma4x4   i_idx = luma4x4idx
      *                3-> DC Chroma i_idx = iCbCr
      *                4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
+     *                5-> Luma8x8   i_idx = luma8x8idx
      */
 
-    //fprintf( stderr, "l[] = " );
     for( i = 0; i < i_count; i++ )
     {
-        //fprintf( stderr, "%d ", l[i] );
         if( l[i] != 0 )
         {
             i_coeff_abs_m1[i_coeff] = abs( l[i] ) - 1;
-            i_coeff_sign[i_coeff]   = ( l[i] < 0 ? 1 : 0);
+            i_coeff_sign[i_coeff]   = ( l[i] < 0 );
             i_coeff++;
 
             i_last = i;
         }
     }
-    //fprintf( stderr, "\n" );
 
-    if( i_coeff == 0 )
+    if( i_count != 64 )
     {
-        /* codec block flag */
-        x264_cabac_encode_decision( &h->cabac,  85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), 0 );
-        return;
+        /* coded block flag */
+        x264_cabac_encode_decision( &h->cabac, 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), i_coeff != 0 );
+        if( i_coeff == 0 )
+            return;
     }
 
-    /* block coded */
-    x264_cabac_encode_decision( &h->cabac,  85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), 1 );
     for( i = 0; i < i_count - 1; i++ )
     {
-        int i_ctxIdxInc;
+        int i_sig_ctxIdxInc;
+        int i_last_ctxIdxInc;
 
-        i_ctxIdxInc = X264_MIN( i, i_count - 2 );
+        if( i_ctxBlockCat == 5 )
+        {
+            i_sig_ctxIdxInc = significant_coeff_flag_offset_8x8[i];
+            i_last_ctxIdxInc = last_significant_coeff_flag_offset_8x8[i];
+        }
+        else
+            i_sig_ctxIdxInc = i_last_ctxIdxInc = i;
 
         if( l[i] != 0 )
         {
-            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, 1 );
-            x264_cabac_encode_decision( &h->cabac, 166 + last_significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, i == i_last ? 1 : 0 );
+            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_sig_ctxIdxInc, 1 );
+            x264_cabac_encode_decision( &h->cabac, 166 + last_significant_coeff_flag_offset[i_ctxBlockCat] + i_last_ctxIdxInc, i == i_last ? 1 : 0 );
         }
         else
         {
-            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, 0 );
+            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_sig_ctxIdxInc, 0 );
         }
         if( i == i_last )
         {
@@ -905,13 +929,9 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx,
             x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 1 );
             i_ctxIdxInc = 5 + X264_MIN( 4, i_abslevelgt1 ) + coeff_abs_level_m1_offset[i_ctxBlockCat];
             for( j = 0; j < i_prefix - 1; j++ )
-            {
                 x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 1 );
-            }
             if( i_prefix < 14 )
-            {
                 x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 0 );
-            }
         }
         /* suffix */
         if( i_coeff_abs_m1[i] >= 14 )
@@ -927,23 +947,16 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx,
             }
             x264_cabac_encode_bypass( &h->cabac, 0 );
             while( k-- )
-            {
                 x264_cabac_encode_bypass( &h->cabac, (i_suffix >> k)&0x01 );
-            }
         }
 
         /* write sign */
         x264_cabac_encode_bypass( &h->cabac, i_coeff_sign[i] );
 
-
         if( i_coeff_abs_m1[i] == 0 )
-        {
             i_abslevel1++;
-        }
         else
-        {
             i_abslevelgt1++;
-        }
     }
 }
 
@@ -992,17 +1005,21 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
 
     if( IS_INTRA( i_mb_type ) )
     {
-        /* Prediction */
-        if( i_mb_type == I_4x4 )
+        if( h->pps->b_transform_8x8_mode && i_mb_type != I_16x16 )
+            x264_cabac_mb_transform_size( h );
+
+        if( i_mb_type != I_16x16 )
         {
-            for( i = 0; i < 16; i++ )
+            int di = (i_mb_type == I_8x8) ? 4 : 1;
+            for( i = 0; i < 16; i += di )
             {
                 const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
                 const int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
                 x264_cabac_mb_intra4x4_pred_mode( h, i_pred, i_mode );
             }
         }
-        x264_cabac_mb_intra8x8_pred_mode( h );
+
+        x264_cabac_mb_intra_chroma_pred_mode( h );
     }
     else if( i_mb_type == P_L0 )
     {
@@ -1068,12 +1085,8 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
             if( ( i_list ? h->sh.i_num_ref_idx_l1_active : h->sh.i_num_ref_idx_l0_active ) == 1 )
                 continue;
             for( i = 0; i < 4; i++ )
-            {
                 if( x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
-                {
                     x264_cabac_mb_ref( h, i_list, 4*i );
-                }
-            }
         }
 
         x264_cabac_mb8x8_mvd( h, 0 );
@@ -1141,6 +1154,12 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
         x264_cabac_mb_cbp_chroma( h );
     }
 
+    if( h->pps->b_transform_8x8_mode && h->mb.i_cbp_luma && !IS_INTRA(i_mb_type)
+        && x264_mb_transform_8x8_allowed( h, i_mb_type ) )
+    {
+        x264_cabac_mb_transform_size( h );
+    }
+
     if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 )
     {
         x264_cabac_mb_qp_delta( h );
@@ -1151,24 +1170,22 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
             /* DC Luma */
             block_residual_write_cabac( h, 0, 0, h->dct.luma16x16_dc, 16 );
 
+            /* AC Luma */
             if( h->mb.i_cbp_luma != 0 )
-            {
-                /* AC Luma */
                 for( i = 0; i < 16; i++ )
-                {
                     block_residual_write_cabac( h, 1, i, h->dct.block[i].residual_ac, 15 );
-                }
-            }
+        }
+        else if( h->mb.b_transform_8x8 )
+        {
+            for( i = 0; i < 4; i++ )
+                if( h->mb.i_cbp_luma & ( 1 << i ) )
+                    block_residual_write_cabac( h, 5, i, h->dct.luma8x8[i], 64 );
         }
         else
         {
             for( i = 0; i < 16; i++ )
-            {
                 if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) )
-                {
                     block_residual_write_cabac( h, 2, i, h->dct.block[i].luma4x4, 16 );
-                }
-            }
         }
 
         if( h->mb.i_cbp_chroma &0x03 )    /* Chroma DC residual present */
@@ -1179,9 +1196,7 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
         if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
         {
             for( i = 0; i < 8; i++ )
-            {
                 block_residual_write_cabac( h, 4, i, h->dct.block[16+i].residual_ac, 15 );
-            }
         }
     }
 
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index a9715c27..11a542eb 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -397,12 +397,15 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         }
         return;
     }
-    else if( i_mb_type == I_4x4 )
+    else if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
     {
+        int di = i_mb_type == I_8x8 ? 4 : 1;
         bs_write_ue( s, i_mb_i_offset + 0 );
+        if( h->pps->b_transform_8x8_mode )
+            bs_write1( s, h->mb.b_transform_8x8 );
 
         /* Prediction: Luma */
-        for( i = 0; i < 16; i++ )
+        for( i = 0; i < 16; i += di )
         {
             int i_pred = x264_mb_predict_intra4x4_mode( h, i );
             int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
@@ -640,7 +643,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
     h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
 
     /* Coded block patern */
-    if( i_mb_type == I_4x4 )
+    if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
     {
         bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
     }
@@ -649,6 +652,13 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
     }
 
+    /* transform size 8x8 flag */
+    if( h->pps->b_transform_8x8_mode && h->mb.i_cbp_luma && !IS_INTRA(i_mb_type)
+        && x264_mb_transform_8x8_allowed( h, i_mb_type ) )
+    {
+        bs_write1( s, h->mb.b_transform_8x8 );
+    }
+
     /* write residual */
     if( i_mb_type == I_16x16 )
     {
@@ -670,6 +680,19 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
     {
         bs_write_se( s, h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp );
 
+        /* shuffle 8x8 dct coeffs into 4x4 lists */
+        if( h->mb.b_transform_8x8 )
+        {
+            int i4;
+            for( i4 = 0; i4 < 16; i4++ )
+            {
+                for( i = 0; i < 16; i++ )
+                    h->dct.block[i4].luma4x4[i] = h->dct.luma8x8[i4>>2][(i4&3)+i*4];
+                h->mb.cache.non_zero_count[x264_scan8[i4]] =
+                    array_non_zero_count( h->dct.block[i4].luma4x4, 16 );
+            }
+        }
+
         for( i = 0; i < 16; i++ )
         {
             if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) )
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 854aa792..e5032c8c 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -50,7 +50,7 @@
 #endif
 
 //#define DEBUG_MB_TYPE
-//#define DEBUG_DUMP_FRAME
+//#define DEBUG_DUMP_FRAME
 //#define DEBUG_BENCHMARK
 
 #ifdef DEBUG_BENCHMARK
@@ -408,6 +408,11 @@ static int x264_validate_parameters( x264_t *h )
     h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 5 );
     if( !(h->param.analyse.inter & X264_ANALYSE_PSUB16x16) )
         h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
+    if( !h->param.analyse.b_transform_8x8 )
+    {
+        h->param.analyse.inter &= ~X264_ANALYSE_I8x8;
+        h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
+    }
     h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
     h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 2048);
 
@@ -426,7 +431,9 @@ static int x264_validate_parameters( x264_t *h )
 x264_t *x264_encoder_open   ( x264_param_t *param )
 {
     x264_t *h = x264_malloc( sizeof( x264_t ) );
-    int i, i_slice;
+    int i;
+
+    memset( h, 0, sizeof( x264_t ) );
 
     /* Create a copy of param */
     memcpy( &h->param, param, sizeof( x264_param_t ) );
@@ -536,6 +543,7 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
 
     /* init CPU functions */
     x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
+    x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
     x264_predict_8x8_init( h->param.cpu, h->predict_8x8 );
     x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
 
@@ -548,21 +556,6 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
     if( x264_ratecontrol_new( h ) < 0 )
         return NULL;
 
-    /* stat */
-    for( i_slice = 0; i_slice < 5; i_slice++ )
-    {
-        h->stat.i_slice_count[i_slice] = 0;
-        h->stat.i_slice_size[i_slice] = 0;
-        h->stat.i_slice_qp[i_slice] = 0;
-
-        h->stat.i_sqe_global[i_slice] = 0;
-        h->stat.f_psnr_average[i_slice] = 0.0;
-        h->stat.f_psnr_mean_y[i_slice] = h->stat.f_psnr_mean_u[i_slice] = h->stat.f_psnr_mean_v[i_slice] = 0.0;
-        
-        for( i = 0; i < 18; i++ )
-            h->stat.i_mb_count[i_slice][i] = 0;
-    }
-
     x264_log( h, X264_LOG_INFO, "using cpu capabilities %s%s%s%s%s%s\n",
              param->cpu&X264_CPU_MMX ? "MMX " : "",
              param->cpu&X264_CPU_MMXEXT ? "MMXEXT " : "",
@@ -889,6 +882,7 @@ static int x264_slice_write( x264_t *h )
     int i_skip;
     int mb_xy;
 
+    /* init stats */
     memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
 
     /* Slice */
@@ -1468,7 +1462,7 @@ do_encode:
     h->stat.i_slice_size[i_slice_type] += i_frame_size + NALU_OVERHEAD;
     h->stat.i_slice_qp[i_slice_type] += i_global_qp;
 
-    for( i = 0; i < 18; i++ )
+    for( i = 0; i < 19; i++ )
     {
         h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i];
     }
@@ -1500,13 +1494,14 @@ do_encode:
     }
     
     x264_log( h, X264_LOG_DEBUG,
-                  "frame=%4d QP=%i NAL=%d Slice:%c Poc:%-3d I4x4:%-4d I16x16:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n",
+                  "frame=%4d QP=%i NAL=%d Slice:%c Poc:%-3d I4:%-4d I8:%-4d I16:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n",
               h->i_frame - 1,
               i_global_qp,
               i_nal_ref_idc,
               i_slice_type == SLICE_TYPE_I ? 'I' : (i_slice_type == SLICE_TYPE_P ? 'P' : 'B' ),
               frame_psnr->i_poc,
               h->stat.frame.i_mb_count[I_4x4],
+              h->stat.frame.i_mb_count[I_8x8],
               h->stat.frame.i_mb_count[I_16x16],
               h->stat.frame.i_mb_count_p,
               h->stat.frame.i_mb_count_skip,
@@ -1516,12 +1511,12 @@ do_encode:
 
 #ifdef DEBUG_MB_TYPE
 {
-    static const char mb_chars[] = { 'i', 'I', 'C', 'P', '8', 'S',
+    static const char mb_chars[] = { 'i', 'i', 'I', 'C', 'P', '8', 'S',
         'D', '<', 'X', 'B', 'X', '>', 'B', 'B', 'B', 'B', '8', 'S' };
     int mb_xy;
     for( mb_xy = 0; mb_xy < h->sps->i_mb_width * h->sps->i_mb_height; mb_xy++ )
     {
-        if( h->mb.type[mb_xy] < 18 && h->mb.type[mb_xy] >= 0 )
+        if( h->mb.type[mb_xy] < 19 && h->mb.type[mb_xy] >= 0 )
             fprintf( stderr, "%c ", mb_chars[ h->mb.type[mb_xy] ] );
         else
             fprintf( stderr, "? " );
@@ -1609,8 +1604,9 @@ void    x264_encoder_close  ( x264_t *h )
         const int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I];
         const double i_count = h->stat.i_slice_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0;
         x264_log( h, X264_LOG_INFO,
-                  "slice I   Avg I4x4:%.1f%%  I16x16:%.1f%%\n",
+                  "slice I   Avg I4x4:%.1f%%  I8x8:%.1f%%  I16x16:%.1f%%\n",
                   i_mb_count[I_4x4]  / i_count,
+                  i_mb_count[I_8x8]  / i_count,
                   i_mb_count[I_16x16]/ i_count );
     }
     if( h->stat.i_slice_count[SLICE_TYPE_P] > 0 )
@@ -1618,8 +1614,9 @@ void    x264_encoder_close  ( x264_t *h )
         const int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P];
         const double i_count = h->stat.i_slice_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0;
         x264_log( h, X264_LOG_INFO,
-                  "slice P   Avg I4x4:%.1f%%  I16x16:%.1f%%  P:%.1f%%  P8x8:%.1f%%  PSKIP:%.1f%%\n",
+                  "slice P   Avg I4x4:%.1f%%  I8x8:%.1f%%  I16x16:%.1f%%  P:%.1f%%  P8x8:%.1f%%  PSKIP:%.1f%%\n",
                   i_mb_count[I_4x4]  / i_count,
+                  i_mb_count[I_8x8]  / i_count,
                   i_mb_count[I_16x16]/ i_count,
                   i_mb_count[P_L0]   / i_count,
                   i_mb_count[P_8x8]  / i_count,
@@ -1630,8 +1627,9 @@ void    x264_encoder_close  ( x264_t *h )
         const int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B];
         const double i_count = h->stat.i_slice_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0;
         x264_log( h, X264_LOG_INFO,
-                  "slice B   Avg I4x4:%.1f%%  I16x16:%.1f%%  P:%.1f%%  B:%.1f%%  B8x8:%.1f%%  DIRECT:%.1f%%  BSKIP:%.1f%%\n",
+                  "slice B   Avg I4x4:%.1f%%  I8x8:%.1f%%  I16x16:%.1f%%  P:%.1f%%  B:%.1f%%  B8x8:%.1f%%  DIRECT:%.1f%%  BSKIP:%.1f%%\n",
                   i_mb_count[I_4x4]    / i_count,
+                  i_mb_count[I_8x8]    / i_count,
                   i_mb_count[I_16x16]  / i_count,
                   (i_mb_count[B_L0_L0] + i_mb_count[B_L1_L1] + i_mb_count[B_L1_L0] + i_mb_count[B_L0_L1]) / i_count,
                   (i_mb_count[B_BI_BI] + i_mb_count[B_L0_BI] + i_mb_count[B_L1_BI] + i_mb_count[B_BI_L0] + i_mb_count[B_BI_L1]) / i_count,
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index ae9d1eea..d0d2d2cc 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -39,26 +39,85 @@ static const uint8_t block_idx_y[16] =
 };
 static const uint8_t block_idx_xy[4][4] =
 {
-    { 0, 2, 8,  10},
-    { 1, 3, 9,  11},
-    { 4, 6, 12, 14},
-    { 5, 7, 13, 15}
+    { 0, 2, 8,  10 },
+    { 1, 3, 9,  11 },
+    { 4, 6, 12, 14 },
+    { 5, 7, 13, 15 }
 };
 
 static const int quant_mf[6][4][4] =
 {
-    {  { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243},
-       { 13107, 8066, 13107, 8066}, {  8066, 5243,  8066, 5243}  },
-    {  { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660},
-       { 11916, 7490, 11916, 7490}, {  7490, 4660,  7490, 4660}  },
-    {  { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194},
-       { 10082, 6554, 10082, 6554}, {  6554, 4194,  6554, 4194}  },
-    {  {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647},
-       {  9362, 5825,  9362, 5825}, {  5825, 3647,  5825, 3647}  },
-    {  {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355},
-       {  8192, 5243,  8192, 5243}, {  5243, 3355,  5243, 3355}  },
-    {  {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893},
-       {  7282, 4559,  7282, 4559}, {  4559, 2893,  4559, 2893}  }
+    { { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 },
+      { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 } },
+    { { 11916, 7490, 11916, 7490 }, { 7490, 4660, 7490, 4660 },
+      { 11916, 7490, 11916, 7490 }, { 7490, 4660, 7490, 4660 } },
+    { { 10082, 6554, 10082, 6554 }, { 6554, 4194, 6554, 4194 },
+      { 10082, 6554, 10082, 6554 }, { 6554, 4194, 6554, 4194 } },
+    { {  9362, 5825,  9362, 5825 }, { 5825, 3647, 5825, 3647 },
+      {  9362, 5825,  9362, 5825 }, { 5825, 3647, 5825, 3647 } },
+    { {  8192, 5243,  8192, 5243 }, { 5243, 3355, 5243, 3355 },
+      {  8192, 5243,  8192, 5243 }, { 5243, 3355, 5243, 3355 } },
+    { {  7282, 4559,  7282, 4559 }, { 4559, 2893, 4559, 2893 },
+      {  7282, 4559,  7282, 4559 }, { 4559, 2893, 4559, 2893 } }
+};
+
+const int quant8_mf[6][8][8] = /* 8x8 quantisation multipliers; quant_8x8() reads them as quant8_mf[i_qscale % 6][y][x] */
+{
+  {
+    { 13107, 12222, 16777, 12222, 13107, 12222, 16777, 12222 },
+    { 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428 },
+    { 16777, 15481, 20972, 15481, 16777, 15481, 20972, 15481 },
+    { 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428 },
+    { 13107, 12222, 16777, 12222, 13107, 12222, 16777, 12222 },
+    { 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428 },
+    { 16777, 15481, 20972, 15481, 16777, 15481, 20972, 15481 },
+    { 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428 }
+  }, {
+    { 11916, 11058, 14980, 11058, 11916, 11058, 14980, 11058 },
+    { 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826 },
+    { 14980, 14290, 19174, 14290, 14980, 14290, 19174, 14290 },
+    { 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826 },
+    { 11916, 11058, 14980, 11058, 11916, 11058, 14980, 11058 },
+    { 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826 },
+    { 14980, 14290, 19174, 14290, 14980, 14290, 19174, 14290 },
+    { 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826 }
+  }, {
+    { 10082,  9675, 12710,  9675, 10082,  9675, 12710,  9675 },
+    {  9675,  8943, 11985,  8943,  9675,  8943, 11985,  8943 },
+    { 12710, 11985, 15978, 11985, 12710, 11985, 15978, 11985 },
+    {  9675,  8943, 11985,  8943,  9675,  8943, 11985,  8943 },
+    { 10082,  9675, 12710,  9675, 10082,  9675, 12710,  9675 },
+    {  9675,  8943, 11985,  8943,  9675,  8943, 11985,  8943 },
+    { 12710, 11985, 15978, 11985, 12710, 11985, 15978, 11985 },
+    {  9675,  8943, 11985,  8943,  9675,  8943, 11985,  8943 }
+  }, {
+    {  9362,  8931, 11984,  8931,  9362,  8931, 11984,  8931 },
+    {  8931,  8228, 11259,  8228,  8931,  8228, 11259,  8228 },
+    { 11984, 11259, 14913, 11259, 11984, 11259, 14913, 11259 },
+    {  8931,  8228, 11259,  8228,  8931,  8228, 11259,  8228 },
+    {  9362,  8931, 11984,  8931,  9362,  8931, 11984,  8931 },
+    {  8931,  8228, 11259,  8228,  8931,  8228, 11259,  8228 },
+    { 11984, 11259, 14913, 11259, 11984, 11259, 14913, 11259 },
+    {  8931,  8228, 11259,  8228,  8931,  8228, 11259,  8228 }
+  }, {
+    {  8192,  7740, 10486,  7740,  8192,  7740, 10486,  7740 },
+    {  7740,  7346,  9777,  7346,  7740,  7346,  9777,  7346 },
+    { 10486,  9777, 13159,  9777, 10486,  9777, 13159,  9777 },
+    {  7740,  7346,  9777,  7346,  7740,  7346,  9777,  7346 },
+    {  8192,  7740, 10486,  7740,  8192,  7740, 10486,  7740 },
+    {  7740,  7346,  9777,  7346,  7740,  7346,  9777,  7346 },
+    { 10486,  9777, 13159,  9777, 10486,  9777, 13159,  9777 },
+    {  7740,  7346,  9777,  7346,  7740,  7346,  9777,  7346 }
+  }, {
+    {  7282,  6830,  9118,  6830,  7282,  6830,  9118,  6830 },
+    {  6830,  6428,  8640,  6428,  6830,  6428,  8640,  6428 },
+    {  9118,  8640, 11570,  8640,  9118,  8640, 11570,  8640 },
+    {  6830,  6428,  8640,  6428,  6830,  6428,  8640,  6428 },
+    {  7282,  6830,  9118,  6830,  7282,  6830,  9118,  6830 },
+    {  6830,  6428,  8640,  6428,  6830,  6428,  8640,  6428 },
+    {  9118,  8640, 11570,  8640,  9118,  8640, 11570,  8640 },
+    {  6830,  6428,  8640,  6428,  6830,  6428,  8640,  6428 }
+  }
 };
 
 static const int i_chroma_qp_table[52] =
@@ -77,67 +136,67 @@ static const int i_chroma_qp_table[52] =
 //static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
 //static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
 
+#define ZIG(i,y,x) level[i] = dct[y][x]; /* store coefficient at row y, column x as scan position i; #undef'd after the scan helpers */
+static inline void scan_zigzag_8x8full( int level[64], int16_t dct[8][8] ) /* flatten all 64 coefficients of dct into level[] in the order enumerated below */
+{
+    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
+    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
+    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)
+    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)
+    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)
+    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)
+    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)
+    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)
+    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)
+    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)
+    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)
+    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)
+    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)
+    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)
+    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)
+    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
+}
 static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
 {
-    level[0] = dct[0][0];
-    level[1] = dct[0][1];
-    level[2] = dct[1][0];
-    level[3] = dct[2][0];
-    level[4] = dct[1][1];
-    level[5] = dct[0][2];
-    level[6] = dct[0][3];
-    level[7] = dct[1][2];
-    level[8] = dct[2][1];
-    level[9] = dct[3][0];
-    level[10] = dct[3][1];
-    level[11] = dct[2][2];
-    level[12] = dct[1][3];
-    level[13] = dct[2][3];
-    level[14] = dct[3][2];
-    level[15] = dct[3][3];
-#if 0
-    int i;
-    for( i = 0; i < 16; i++ )
-    {
-        level[i] = dct[scan_zigzag_y[i]][scan_zigzag_x[i]];
-    }
-#endif
+    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
+    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
+    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
+    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 }
 static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
 {
-    level[0] = dct[0][1];
-    level[1] = dct[1][0];
-    level[2] = dct[2][0];
-    level[3] = dct[1][1];
-    level[4] = dct[0][2];
-    level[5] = dct[0][3];
-    level[6] = dct[1][2];
-    level[7] = dct[2][1];
-    level[8] = dct[3][0];
-    level[9] = dct[3][1];
-    level[10] = dct[2][2];
-    level[11] = dct[1][3];
-    level[12] = dct[2][3];
-    level[13] = dct[3][2];
-    level[14] = dct[3][3];
-#if 0
-    int i;
-    for( i = 1; i < 16; i++ )
-    {
-        level[i - 1] = dct[scan_zigzag_y[i]][scan_zigzag_x[i]];
-    }
-#endif
+                ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
+    ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
+    ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
+    ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
 }
-
 static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
 {
-    level[0] = dct[0][0];
-    level[1] = dct[0][1];
-    level[2] = dct[1][0];
-    level[3] = dct[1][1];
+    ZIG(0,0,0)
+    ZIG(1,0,1)
+    ZIG(2,1,0)
+    ZIG(3,1,1)
 }
+#undef ZIG
 
+static void quant_8x8( int16_t dct[8][8], int i_qscale, int b_intra ) /* quantise the 8x8 coefficient block dct in place */
+{
+    const int i_qbits = 16 + i_qscale / 6; /* right shift applied after the multiply; one more bit per 6 steps of i_qscale */
+    const int i_mf = i_qscale % 6; /* selects one of the six quant8_mf tables */
+    const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 ); /* rounding offset added before the shift: (1<<i_qbits)/3 for intra, /6 otherwise */
 
+    int x,y;
+    for( y = 0; y < 8; y++ )
+    {
+        for( x = 0; x < 8; x++ )
+        {
+            if( dct[y][x] > 0 )
+                dct[y][x] = ( f + dct[y][x] * quant8_mf[i_mf][y][x] ) >> i_qbits;
+            else /* zero or negative: quantise the magnitude, then negate, so rounding is symmetric about zero */
+                dct[y][x] = - ( ( f - dct[y][x] * quant8_mf[i_mf][y][x] ) >> i_qbits );
+        }
+    }
+}
 static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
 {
     const int i_qbits = 15 + i_qscale / 6;
@@ -150,13 +209,9 @@ static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra )
         for( x = 0; x < 4; x++ )
         {
             if( dct[y][x] > 0 )
-            {
-                dct[y][x] =( f + dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits;
-            }
+                dct[y][x] = ( f + dct[y][x] * quant_mf[i_mf][y][x] ) >> i_qbits;
             else
-            {
-                dct[y][x] = - ( ( f - dct[y][x]  * quant_mf[i_mf][y][x] ) >> i_qbits );
-            }
+                dct[y][x] = - ( ( f - dct[y][x] * quant_mf[i_mf][y][x] ) >> i_qbits );
         }
     }
 }
@@ -172,13 +227,9 @@ static void quant_4x4_dc( int16_t dct[4][4], int i_qscale )
         for( x = 0; x < 4; x++ )
         {
             if( dct[y][x] > 0 )
-            {
                 dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
-            }
             else
-            {
                 dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
-            }
         }
     }
 }
@@ -194,13 +245,9 @@ static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
         for( x = 0; x < 2; x++ )
         {
             if( dct[y][x] > 0 )
-            {
                 dct[y][x] =( f2 + dct[y][x]  * i_qmf) >> ( 1 + i_qbits );
-            }
             else
-            {
                 dct[y][x] = - ( ( f2 - dct[y][x]  * i_qmf ) >> (1 + i_qbits ) );
-            }
         }
     }
 }
@@ -306,21 +353,6 @@ static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra )
 
 #endif
 
-static inline int array_non_zero_count( int *v, int i_count )
-{
-    int i;
-    int i_nz;
-
-    for( i = 0, i_nz = 0; i < i_count; i++ )
-    {
-        if( v[i] )
-        {
-            i_nz++;
-        }
-    }
-    return i_nz;
-}
-
 /* (ref: JVT-B118)
  * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs
  * to 0 (low score means set it to null)
@@ -331,24 +363,27 @@ static inline int array_non_zero_count( int *v, int i_count )
  */
 static int x264_mb_decimate_score( int *dct, int i_max )
 {
-    static const int i_ds_table[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-
+    static const int i_ds_table4[16] = {
+        3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
+    static const int i_ds_table8[64] = {
+        3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
+        1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
+
+    const int *ds_table = (i_max == 64) ? i_ds_table8 : i_ds_table4;
     int i_score = 0;
     int idx = i_max - 1;
 
     while( idx >= 0 && dct[idx] == 0 )
-    {
         idx--;
-    }
 
     while( idx >= 0 )
     {
         int i_run;
 
         if( abs( dct[idx--] ) > 1 )
-        {
             return 9;
-        }
 
         i_run = 0;
         while( idx >= 0 && dct[idx] == 0 )
@@ -356,7 +391,7 @@ static int x264_mb_decimate_score( int *dct, int i_max )
             idx--;
             i_run++;
         }
-        i_score += i_ds_table[i_run];
+        i_score += ds_table[i_run];
     }
 
     return i_score;
@@ -365,23 +400,35 @@ static int x264_mb_decimate_score( int *dct, int i_max )
 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
 {
     const int i_stride = h->mb.pic.i_stride[0];
-    uint8_t  *p_src = &h->mb.pic.p_fenc[0][4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride];
-    uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride];
-
+    const int i_offset = 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride;
+    uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset];
+    uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset];
     int16_t dct4x4[4][4];
 
     h->dctf.sub4x4_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
-
     quant_4x4( dct4x4, i_qscale, 1 );
-
     scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4 );
-
     x264_mb_dequant_4x4( dct4x4, i_qscale );
 
     /* output samples to fdec */
     h->dctf.add4x4_idct( p_dst, i_stride, dct4x4 );
 }
 
+void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale ) /* transform, quantise and reconstruct luma 8x8 block idx at quantiser i_qscale */
+{
+    const int i_stride = h->mb.pic.i_stride[0];
+    const int i_offset = 8 * (idx&1) + 8 * (idx>>1) * i_stride; /* idx&1 picks the 8-sample column, idx>>1 the 8-line row */
+    uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset];
+    uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset]; /* x264_macroblock_encode() writes the intra prediction here before calling */
+    int16_t dct8x8[8][8];
+
+    h->dctf.sub8x8_dct8( dct8x8, p_src, i_stride, p_dst, i_stride ); /* presumably transforms the p_src - p_dst residual; confirm in dctf */
+    quant_8x8( dct8x8, i_qscale, 1 ); /* 1 = intra rounding offset */
+    scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 ); /* keep the quantised levels in h->dct.luma8x8[idx] */
+    x264_mb_dequant_8x8( dct8x8, i_qscale );
+    h->dctf.add8x8_idct8( p_dst, i_stride, dct8x8 ); /* output samples to fdec, as x264_mb_encode_i4x4 does for 4x4 blocks */
+}
+
 static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
 {
     const int i_stride = h->mb.pic.i_stride[0];
@@ -422,7 +469,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
     h->dctf.add16x16_idct( p_dst, i_stride, &dct4x4[1] );
 }
 
-static void x264_mb_encode_8x8( x264_t *h, int b_inter, int i_qscale )
+static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
 {
     int i, ch;
 
@@ -572,6 +619,19 @@ void x264_macroblock_encode( x264_t *h )
         /* fix the pred mode value */
         h->mb.i_intra16x16_pred_mode = x264_mb_pred_mode16x16_fix[i_mode];
     }
+    else if( h->mb.i_type == I_8x8 )
+    {
+        for( i = 0; i < 4; i++ )
+        {
+            const int i_dst = h->mb.pic.i_stride[0];
+            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * i_dst];
+            int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
+
+            h->predict_8x8[i_mode]( p_dst, i_dst, h->mb.i_neighbour8[i] );
+            x264_mb_encode_i8x8( h, i, i_qscale );
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]] = x264_mb_pred_mode4x4_fix(i_mode);
+        }
+    }
     else if( h->mb.i_type == I_4x4 )
     {
         for( i = 0; i < 16; i++ )
@@ -580,83 +640,95 @@ void x264_macroblock_encode( x264_t *h )
             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * i_dst];
             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
 
-            /* Do the right prediction */
             h->predict_4x4[i_mode]( p_dst, i_dst );
-
-            /* encode one 4x4 block */
             x264_mb_encode_i4x4( h, i, i_qscale );
-
-            /* fix the pred mode value */
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = x264_mb_pred_mode4x4_fix[i_mode];
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = x264_mb_pred_mode4x4_fix(i_mode);
         }
     }
     else    /* Inter MB */
     {
-        int16_t dct4x4[16][4][4];
-
         int i8x8, i4x4, idx;
         int i_decimate_mb = 0;
 
         /* Motion compensation */
         x264_mb_mc( h );
 
-        h->dctf.sub16x16_dct( dct4x4,
-                              h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
-                              h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
-
-        for( i8x8 = 0; i8x8 < 4; i8x8++ )
+        if( h->mb.b_transform_8x8 )
         {
-            int i_decimate_8x8;
+            int16_t dct8x8[4][8][8];
+            h->dctf.sub16x16_dct8( dct8x8,
+                                   h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                                   h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 
-            /* encode one 4x4 block */
-            i_decimate_8x8 = 0;
-            for( i4x4 = 0; i4x4 < 4; i4x4++ )
+            for( idx = 0; idx < 4; idx++ )
             {
-                idx = i8x8 * 4 + i4x4;
+                int i_decimate_8x8;
 
-                quant_4x4( dct4x4[idx], i_qscale, 0 );
-                scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
-                x264_mb_dequant_4x4( dct4x4[idx], i_qscale );
+                quant_8x8( dct8x8[idx], i_qscale, 0 );
+                scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8[idx] );
+                x264_mb_dequant_8x8( dct8x8[idx], i_qscale );
 
-                i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
+                i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
+                i_decimate_mb += i_decimate_8x8;
+                if( i_decimate_8x8 < 4 )
+                {
+                    memset( h->dct.luma8x8[idx], 0, sizeof( h->dct.luma8x8[idx] ) );
+                    memset( dct8x8[idx], 0, sizeof( dct8x8[idx] ) );
+                }
             }
 
-            /* decimate this 8x8 block */
-            i_decimate_mb += i_decimate_8x8;
-            if( i_decimate_8x8 < 4 )
+            if( i_decimate_mb < 6 )
+                memset( h->dct.luma8x8, 0, sizeof( h->dct.luma8x8 ) );
+            else
+                h->dctf.add16x16_idct8( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct8x8 );
+        }
+        else
+        {
+            int16_t dct4x4[16][4][4];
+            h->dctf.sub16x16_dct( dct4x4,
+                                  h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                                  h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
+            for( i8x8 = 0; i8x8 < 4; i8x8++ )
             {
+                int i_decimate_8x8;
+
+                /* encode one 4x4 block */
+                i_decimate_8x8 = 0;
                 for( i4x4 = 0; i4x4 < 4; i4x4++ )
                 {
-                    int x, y;
                     idx = i8x8 * 4 + i4x4;
-                    for( i = 0; i < 16; i++ )
-                    {
-                        h->dct.block[idx].luma4x4[i] = 0;
-                    }
-                    for( x = 0; x < 4; x++ )
-                    {
-                        for( y = 0; y < 4; y++ )
-                        {
-                            dct4x4[idx][x][y] = 0;
-                        }
-                    }
+
+                    quant_4x4( dct4x4[idx], i_qscale, 0 );
+                    scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
+                    x264_mb_dequant_4x4( dct4x4[idx], i_qscale );
+
+                    i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
                 }
-            }
-        }
 
-        if( i_decimate_mb < 6 )
-        {
-            for( idx = 0; idx < 16; idx++ )
-            {
-                for( i = 0; i < 16; i++ )
+                /* decimate this 8x8 block */
+                i_decimate_mb += i_decimate_8x8;
+                if( i_decimate_8x8 < 4 )
                 {
-                    h->dct.block[idx].luma4x4[i] = 0;
+                    for( i4x4 = 0; i4x4 < 4; i4x4++ )
+                    {
+                        int x, y;
+                        idx = i8x8 * 4 + i4x4;
+                        for( i = 0; i < 16; i++ )
+                            h->dct.block[idx].luma4x4[i] = 0;
+                        for( x = 0; x < 4; x++ )
+                            for( y = 0; y < 4; y++ )
+                                dct4x4[idx][x][y] = 0;
+                    }
                 }
             }
-        }
-        else
-        {
-            h->dctf.add16x16_idct( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct4x4 );
+
+            if( i_decimate_mb < 6 )
+                for( idx = 0; idx < 16; idx++ )
+                    for( i = 0; i < 16; i++ )
+                        h->dct.block[idx].luma4x4[i] = 0;
+            else
+                h->dctf.add16x16_idct( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct4x4 );
         }
     }
 
@@ -666,41 +738,50 @@ void x264_macroblock_encode( x264_t *h )
     {
         const int i_mode = h->mb.i_chroma_pred_mode;
         /* do the right prediction */
-        h->predict_8x8[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] );
-        h->predict_8x8[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] );
+        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] );
+        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] );
 
         /* fix the pred mode value */
-        h->mb.i_chroma_pred_mode = x264_mb_pred_mode8x8_fix[i_mode];
+        h->mb.i_chroma_pred_mode = x264_mb_pred_mode8x8c_fix[i_mode];
     }
 
     /* encode the 8x8 blocks */
-    x264_mb_encode_8x8( h, !IS_INTRA( h->mb.i_type ), i_qscale );
+    x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), i_qscale );
 
     /* Calculate the Luma/Chroma patern and non_zero_count */
+    h->mb.i_cbp_luma = 0x00;
     if( h->mb.i_type == I_16x16 )
     {
-        h->mb.i_cbp_luma = 0x00;
         for( i = 0; i < 16; i++ )
         {
             const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 );
             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
             if( nz > 0 )
-            {
                 h->mb.i_cbp_luma = 0x0f;
-            }
+        }
+    }
+    else if( h->mb.b_transform_8x8 )
+    {
+        /* coded_block_flag is enough for CABAC,
+         * but CAVLC needs the full non_zero_count. */
+        for( i = 0; i < 4; i++ )
+        {
+            const int nz = array_non_zero( h->dct.luma8x8[i], 64 );
+            int j;
+            for( j = 0; j < 4; j++ )
+                h->mb.cache.non_zero_count[x264_scan8[4*i+j]] = nz;
+            if( nz > 0 )
+                h->mb.i_cbp_luma |= 1 << i;
         }
     }
     else
     {
-        h->mb.i_cbp_luma = 0x00;
         for( i = 0; i < 16; i++ )
         {
             const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 );
             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
             if( nz > 0 )
-            {
                 h->mb.i_cbp_luma |= 1 << (i/4);
-            }
         }
     }
 
@@ -772,6 +853,9 @@ void x264_macroblock_encode( x264_t *h )
         h->mb.type[h->mb.i_mb_xy] = h->mb.i_type = B_SKIP;
         h->mb.qp[h->mb.i_mb_xy] = h->mb.i_last_qp;  /* Needed */
     }
+
+    if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
+        h->mb.b_transform_8x8 = 0;
 }
 
 /*****************************************************************************
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index 6c8768ae..a16bcf10 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -39,5 +39,26 @@ void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s );
 
 void x264_cabac_mb_skip( x264_t *h, int b_skip );
 
+static inline int array_non_zero( int *v, int i_count ) /* returns 1 if any of v[0..i_count-1] is non-zero, else 0 */
+{
+    int i;
+    for( i = 0; i < i_count; i++ )
+        if( v[i] ) return 1; /* stop at the first non-zero entry */
+    return 0;
+}
+
+static inline int array_non_zero_count( int *v, int i_count ) /* returns how many of v[0..i_count-1] are non-zero */
+{
+    int i;
+    int i_nz; /* running count of non-zero entries */
+
+    for( i = 0, i_nz = 0; i < i_count; i++ )
+        if( v[i] )
+            i_nz++;
+
+    return i_nz;
+}
+
+
 #endif
 
diff --git a/encoder/set.c b/encoder/set.c
index 8406118d..4f94bad9 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -40,12 +40,14 @@
 
 void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
 {
-    sps->i_id               = i_id;
+    sps->i_id = i_id;
 
-    if( param->b_cabac || param->i_bframe > 0 )
-        sps->i_profile_idc      = PROFILE_MAIN;
+    if( param->analyse.b_transform_8x8 )
+        sps->i_profile_idc  = PROFILE_HIGH;
+    else if( param->b_cabac || param->i_bframe > 0 )
+        sps->i_profile_idc  = PROFILE_MAIN;
     else
-        sps->i_profile_idc      = PROFILE_BASELINE;
+        sps->i_profile_idc  = PROFILE_BASELINE;
     sps->i_level_idc = param->i_level_idc;
 
     sps->b_constraint_set0  = 0;
@@ -160,6 +162,16 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
     bs_write( s, 8, sps->i_level_idc );
 
     bs_write_ue( s, sps->i_id );
+
+    if( sps->i_profile_idc >= PROFILE_HIGH )
+    {
+        bs_write_ue( s, 1 ); // chroma_format_idc = 4:2:0
+        bs_write_ue( s, 0 ); // bit_depth_luma_minus8
+        bs_write_ue( s, 0 ); // bit_depth_chroma_minus8
+        bs_write( s, 1, 0 ); // qpprime_y_zero_transform_bypass_flag
+        bs_write( s, 1, 0 ); // seq_scaling_matrix_present_flag
+    }
+
     bs_write_ue( s, sps->i_log2_max_frame_num - 4 );
     bs_write_ue( s, sps->i_poc_type );
     if( sps->i_poc_type == 0 )
@@ -326,6 +338,8 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *
     pps->b_deblocking_filter_control = 1;
     pps->b_constrained_intra_pred = 0;
     pps->b_redundant_pic_cnt = 0;
+
+    pps->b_transform_8x8_mode = param->analyse.b_transform_8x8 ? 1 : 0;
 }
 
 void x264_pps_write( bs_t *s, x264_pps_t *pps )
@@ -389,6 +403,13 @@ void x264_pps_write( bs_t *s, x264_pps_t *pps )
     bs_write( s, 1, pps->b_constrained_intra_pred );
     bs_write( s, 1, pps->b_redundant_pic_cnt );
 
+    if( pps->b_transform_8x8_mode )
+    {
+        bs_write( s, 1, pps->b_transform_8x8_mode );
+        bs_write( s, 1, 0 ); // pic_scaling_matrix_present_flag
+        bs_write_se( s, 0 ); // second_chroma_qp_index_offset
+    }
+
     bs_rbsp_trailing( s );
 }
 
diff --git a/encoder/slicetype_decision.c b/encoder/slicetype_decision.c
index b24d891b..eadce569 100644
--- a/encoder/slicetype_decision.c
+++ b/encoder/slicetype_decision.c
@@ -197,7 +197,7 @@ lowres_intra_mb:
         for( i = I_PRED_CHROMA_DC; i <= I_PRED_CHROMA_P; i++ )
         {
             int i_cost;
-            h->predict_8x8[i]( &pix1[10], 9 );
+            h->predict_8x8c[i]( &pix1[10], 9 );
             i_cost = h->pixf.satd[PIXEL_8x8]( &pix1[10], 9, src, i_stride ) + intra_penalty;
             i_bcost = X264_MIN( i_bcost, i_cost );
         }
diff --git a/x264.h b/x264.h
index a1a0131a..14a11d21 100644
--- a/x264.h
+++ b/x264.h
@@ -26,7 +26,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 28
+#define X264_BUILD 29
 
 /* x264_t:
  *      opaque handler for decoder and encoder */
@@ -48,6 +48,7 @@ typedef struct x264_t x264_t;
 /* Analyse flags
  */
 #define X264_ANALYSE_I4x4       0x0001  /* Analyse i4x4 */
+#define X264_ANALYSE_I8x8       0x0002  /* Analyse i8x8 (requires 8x8 transform) */
 #define X264_ANALYSE_PSUB16x16  0x0010  /* Analyse p16x8, p8x16 and p8x8 */
 #define X264_ANALYSE_PSUB8x8    0x0020  /* Analyse p8x4, p4x8, p4x4 */
 #define X264_ANALYSE_BSUB16x16  0x0100  /* Analyse b16x8, b8x16 and b8x8 */
@@ -149,8 +150,10 @@ typedef struct
     /* Encoder analyser parameters */
     struct
     {
-        unsigned int intra;     /* intra flags */
-        unsigned int inter;     /* inter flags */
+        unsigned int intra;     /* intra partitions */
+        unsigned int inter;     /* inter partitions */
+
+        int          b_transform_8x8;
 
         int          i_direct_mv_pred; /* spatial vs temporal mv prediction */
         int          i_me_method; /* motion estimation algorithm to use (X264_ME_*) */