From: Fiona Glaser Date: Fri, 30 Jan 2009 11:40:54 +0000 (-0800) Subject: Massive overhaul of nnz/cbp calculation X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e394bd600ba9b1a3cee24e7d0b01dfb0acc5d1ad;p=libx264 Massive overhaul of nnz/cbp calculation Modify quantization to also calculate array_non_zero. PPC assembly changes by gpoirior. New quant asm includes some small tweaks to quant and SSE4 versions using ptest for the array_non_zero. Use this new feature of quant to merge nnz/cbp calculation directly with encoding and avoid many unnecessary calls to dequant/zigzag/decimate/etc. Also add new i16x16 DC-only iDCT with asm. Since intra encoding now directly calculates nnz, skip_intra now backs up nnz/cbp as well. Output should be equivalent except when using p4x4+RDO because of a subtlety involving old nnz values lying around. Performance increase in macroblock_encode: ~18% with dct-decimate, 30% without at CRF 25. Overall performance increase 0-6% depending on encoding settings. --- diff --git a/common/common.h b/common/common.h index 78b1efb6..97c68781 100644 --- a/common/common.h +++ b/common/common.h @@ -471,6 +471,10 @@ struct x264_t DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] ); DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] ); DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] ); + uint32_t i4x4_nnz_buf[4]; + uint32_t i8x8_nnz_buf[4]; + int i4x4_cbp; + int i8x8_cbp; /* Psy trellis DCT data */ DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] ); diff --git a/common/dct.c b/common/dct.c index f8d51e40..5f9f0fb0 100644 --- a/common/dct.c +++ b/common/dct.c @@ -369,6 +369,18 @@ static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] ) add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] ); } +static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] ) +{ + int i; + for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE ) + { + add4x4_idct_dc( &p_dst[ 0], dct[i][0] ); + add4x4_idct_dc( &p_dst[ 4], dct[i][1] ); + add4x4_idct_dc( &p_dst[ 8], dct[i][2] ); + add4x4_idct_dc( &p_dst[12], dct[i][3] ); + } +} + /**************************************************************************** * x264_dct_init: @@ -384,6 +396,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->sub16x16_dct = sub16x16_dct; dctf->add16x16_idct = add16x16_idct; + dctf->add16x16_idct_dc = add16x16_idct_dc; dctf->sub8x8_dct8 = sub8x8_dct8; dctf->add8x8_idct8 = add8x8_idct8; @@ -400,6 +413,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->sub4x4_dct = x264_sub4x4_dct_mmx; dctf->add4x4_idct = x264_add4x4_idct_mmx; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx; dctf->dct4x4dc = x264_dct4x4dc_mmx; dctf->idct4x4dc = x264_idct4x4dc_mmx; @@ -427,10 +441,14 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->sub16x16_dct = x264_sub16x16_dct_sse2; dctf->add8x8_idct = x264_add8x8_idct_sse2; dctf->add16x16_idct = x264_add16x16_idct_sse2; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2; } if( cpu&X264_CPU_SSSE3 ) + { dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; + } #endif //HAVE_MMX #ifdef ARCH_PPC diff --git a/common/dct.h b/common/dct.h index f4474fcc..71951f9b 100644 --- a/common/dct.h +++ b/common/dct.h @@ -100,6 +100,7 @@ typedef struct void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 ); void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] ); + void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[4][4] ); 
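As a point of reference for the DC-only path declared above: add16x16_idct_dc is built from per-block DC adds, and each 4x4 block simply receives its rounded, descaled DC term. A minimal scalar sketch of that per-block operation (illustrative only; the function name and the clip helper below are assumptions, not code from this patch):

    #include <stdint.h>

    static inline uint8_t clip_uint8( int x )
    {
        return x < 0 ? 0 : x > 255 ? 255 : x;   /* assumed helper */
    }

    /* Add one descaled DC coefficient to every pixel of a 4x4 block;
     * a DC-only 4x4 iDCT reduces to exactly this. */
    static void add4x4_idct_dc_sketch( uint8_t *dst, int stride, int16_t dc )
    {
        int x, y;
        int offset = (dc + 32) >> 6;   /* same rounding/descale as the full iDCT */
        for( y = 0; y < 4; y++, dst += stride )
            for( x = 0; x < 4; x++ )
                dst[x] = clip_uint8( dst[x] + offset );
    }

The SSE2/SSSE3 versions added in dct-a.asm below apply the same offsets to a full 16-pixel row (four blocks) at a time, splitting each DC term into a positive and a negative part so the adjustment can be done with saturating byte adds and subtracts.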
void (*sub8x8_dct8) ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ); void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] ); diff --git a/common/ppc/quant.c b/common/ppc/quant.c index 64b34ab5..d1d9d72a 100644 --- a/common/ppc/quant.c +++ b/common/ppc/quant.c @@ -30,10 +30,10 @@ mfvA = vec_ld((idx0), mf); \ mfvB = vec_ld((idx1), mf); \ biasvA = vec_ld((idx0), bias); \ biasvB = vec_ld((idx1), bias); \ -mskA = vec_cmplt(temp1v, zerov); \ -mskB = vec_cmplt(temp2v, zerov); \ -coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v); \ -coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v); \ +mskA = vec_cmplt(temp1v, zero_s16v); \ +mskB = vec_cmplt(temp2v, zero_s16v); \ +coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \ +coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v); \ coefvA = vec_adds(coefvA, biasvA); \ coefvB = vec_adds(coefvB, biasvB); \ multEvenvA = vec_mule(coefvA, mfvA); \ @@ -51,17 +51,20 @@ temp2v = vec_xor(temp2v, mskB); \ temp1v = vec_adds(temp1v, vec_and(mskA, one)); \ vec_st(temp1v, (idx0), (int16_t*)dct); \ temp2v = vec_adds(temp2v, vec_and(mskB, one)); \ +nz = vec_or(nz, vec_or(temp1v, temp2v)); \ vec_st(temp2v, (idx1), (int16_t*)dct); -void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) +int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) { + LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_u16_t mfvA; vec_u16_t biasvA; - vec_s16_t zerov, one; + vec_s16_t one = vec_splat_s16(1);; + vec_s16_t nz = zero_s16v; vector bool short mskB; vec_u16_t coefvB; @@ -75,20 +78,18 @@ void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[1 qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); - zerov = vec_splat_s16(0); - one = vec_splat_s16(1); - QUANT_16_U( 0, 16 ); + return vec_any_ne(nz, zero_s16v); } // DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled" #define QUANT_16_U_DC( idx0, idx1 ) \ temp1v = vec_ld((idx0), *dct); \ temp2v = vec_ld((idx1), *dct); \ -mskA = vec_cmplt(temp1v, zerov); \ -mskB = vec_cmplt(temp2v, zerov); \ -coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \ -coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \ +mskA = vec_cmplt(temp1v, zero_s16v); \ +mskB = vec_cmplt(temp2v, zero_s16v); \ +coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\ +coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\ coefvA = vec_add(coefvA, biasv); \ coefvB = vec_add(coefvB, biasv); \ multEvenvA = vec_mule(coefvA, mfv); \ @@ -106,15 +107,18 @@ temp2v = vec_xor(temp2v, mskB); \ temp1v = vec_add(temp1v, vec_and(mskA, one)); \ vec_st(temp1v, (idx0), (int16_t*)dct); \ temp2v = vec_add(temp2v, vec_and(mskB, one)); \ +nz = vec_or(nz, vec_or(temp1v, temp2v)); \ vec_st(temp2v, (idx1), (int16_t*)dct); -void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias ) +int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias ) { + LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; - vec_s16_t zerov, one; + vec_s16_t one = vec_splat_s16(1); + vec_s16_t nz = zero_s16v; vector bool short mskB; vec_u16_t coefvB; @@ -137,18 +141,16 @@ void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias ) bias_u.s[0]=bias; biasv = vec_splat(bias_u.v, 0); - zerov = vec_splat_s16(0); - one = vec_splat_s16(1); - QUANT_16_U_DC( 0, 16 ); + return vec_any_ne(nz, zero_s16v); } // DC quant 
of a whole 2x2 block #define QUANT_4_U_DC( idx0 ) \ const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0); \ temp1v = vec_ld((idx0), *dct); \ -mskA = vec_cmplt(temp1v, zerov); \ -coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \ +mskA = vec_cmplt(temp1v, zero_s16v); \ +coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\ coefvA = vec_add(coefvA, biasv); \ multEvenvA = vec_mule(coefvA, mfv); \ multOddvA = vec_mulo(coefvA, mfv); \ @@ -158,15 +160,18 @@ temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(mul temp2v = vec_xor(temp2v, mskA); \ temp2v = vec_add(temp2v, vec_and(mskA, one)); \ temp1v = vec_sel(temp1v, temp2v, sel); \ +nz = vec_or(nz, temp1v); \ vec_st(temp1v, (idx0), (int16_t*)dct); -void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ) +int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ) { + LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; - vec_s16_t zerov, one; + vec_s16_t one = vec_splat_s16(1); + vec_s16_t nz = zero_s16v; vec_s16_t temp1v, temp2v; @@ -185,42 +190,41 @@ void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ) bias_u.s[0]=bias; biasv = vec_splat(bias_u.v, 0); - zerov = vec_splat_s16(0); - one = vec_splat_s16(1); - + static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0); QUANT_4_U_DC(0); + return vec_any_ne(vec_and(nz, mask2), zero_s16v); } -void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ) +int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ) { + LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_u16_t mfvA; vec_u16_t biasvA; - vec_s16_t zerov, one; - + vec_s16_t one = vec_splat_s16(1);; + vec_s16_t nz = zero_s16v; + vector bool short mskB; vec_u16_t coefvB; vec_u32_t multEvenvB, multOddvB; vec_u16_t mfvB; vec_u16_t biasvB; - + vec_s16_t temp1v, temp2v; vec_u32_u qbits_u; qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); - - zerov = vec_splat_s16(0); - one = vec_splat_s16(1); int i; for ( i=0; i<4; i++ ) { QUANT_16_U( i*2*16, i*2*16+16 ); } + return vec_any_ne(nz, zero_s16v); } #define DEQUANT_SHL() \ diff --git a/common/ppc/quant.h b/common/ppc/quant.h index 05049003..f55a934a 100644 --- a/common/ppc/quant.h +++ b/common/ppc/quant.h @@ -21,11 +21,11 @@ #ifndef X264_PPC_QUANT_H #define X264_PPC_QUANT_H -void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); -void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); +int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); +int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); -void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias ); -void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ); +int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias ); +int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ); void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ); void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); diff --git a/common/quant.c b/common/quant.c index ac798a25..daf2b5a2 100644 --- a/common/quant.c +++ b/common/quant.c @@ -36,35 +36,41 @@ (coef) = (f + (coef)) * (mf) >> 16; \ else \ (coef) = - ((f - (coef)) * (mf) >> 16); \ + nz |= (coef); \ } -static void quant_8x8( int16_t 
dct[8][8], uint16_t mf[64], uint16_t bias[64] ) +static int quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ) { - int i; + int i, nz = 0; for( i = 0; i < 64; i++ ) QUANT_ONE( dct[0][i], mf[i], bias[i] ); + return !!nz; } -static void quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) +static int quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) { - int i; + int i, nz = 0; for( i = 0; i < 16; i++ ) QUANT_ONE( dct[0][i], mf[i], bias[i] ); + return !!nz; } -static void quant_4x4_dc( int16_t dct[4][4], int mf, int bias ) +static int quant_4x4_dc( int16_t dct[4][4], int mf, int bias ) { - int i; + int i, nz = 0; for( i = 0; i < 16; i++ ) QUANT_ONE( dct[0][i], mf, bias ); + return !!nz; } -static void quant_2x2_dc( int16_t dct[2][2], int mf, int bias ) +static int quant_2x2_dc( int16_t dct[2][2], int mf, int bias ) { + int nz = 0; QUANT_ONE( dct[0][0], mf, bias ); QUANT_ONE( dct[0][1], mf, bias ); QUANT_ONE( dct[0][2], mf, bias ); QUANT_ONE( dct[0][3], mf, bias ); + return !!nz; } #define DEQUANT_SHL( x ) \ @@ -402,6 +408,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->decimate_score16 = x264_decimate_score16_ssse3; pf->decimate_score64 = x264_decimate_score64_ssse3; } + + if( cpu&X264_CPU_SSE4 ) + { + pf->quant_4x4_dc = x264_quant_4x4_dc_sse4; + pf->quant_4x4 = x264_quant_4x4_sse4; + pf->quant_8x8 = x264_quant_8x8_sse4; + } #endif // HAVE_MMX #ifdef ARCH_PPC diff --git a/common/quant.h b/common/quant.h index eaac5937..b8a7b988 100644 --- a/common/quant.h +++ b/common/quant.h @@ -25,10 +25,10 @@ typedef struct { - void (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); - void (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); - void (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias ); - void (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias ); + int (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); + int (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); + int (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias ); + int (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias ); void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ); diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index a474932f..156a7ae4 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -33,6 +33,7 @@ pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 +pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 SECTION .text @@ -324,6 +325,104 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2 movhps [r0+FDEC_STRIDE* 3], xmm5 ret +cglobal x264_add16x16_idct_dc_mmx, 2,3 + mov r2, 4 +.loop: + movq mm0, [r1] + pxor mm1, mm1 + paddw mm0, [pw_32 GLOBAL] + psraw mm0, 6 + psubw mm1, mm0 + packuswb mm0, mm0 + packuswb mm1, mm1 + punpcklbw mm0, mm0 + punpcklbw mm1, mm1 + pshufw mm2, mm0, 0xFA + pshufw mm3, mm1, 0xFA + punpcklbw mm0, mm0 + punpcklbw mm1, mm1 + ADD_DC mm0, mm1, r0 + ADD_DC mm2, mm3, r0+8 + add r1, 8 + add r0, FDEC_STRIDE*4 + dec r2 + jg .loop + ret + +%macro IDCT_DC_STORE 3 + movdqa xmm4, [r0+%1+FDEC_STRIDE*0] + movdqa xmm5, [r0+%1+FDEC_STRIDE*1] + movdqa xmm6, [r0+%1+FDEC_STRIDE*2] + movdqa xmm7, [r0+%1+FDEC_STRIDE*3] + paddusb xmm4, %2 + paddusb xmm5, %2 + paddusb xmm6, %2 + paddusb 
xmm7, %2 + psubusb xmm4, %3 + psubusb xmm5, %3 + psubusb xmm6, %3 + psubusb xmm7, %3 + movdqa [r0+%1+FDEC_STRIDE*0], xmm4 + movdqa [r0+%1+FDEC_STRIDE*1], xmm5 + movdqa [r0+%1+FDEC_STRIDE*2], xmm6 + movdqa [r0+%1+FDEC_STRIDE*3], xmm7 +%endmacro + +cglobal x264_add16x16_idct_dc_sse2, 2,2 + call .loop + add r0, FDEC_STRIDE*4 +.loop: + add r0, FDEC_STRIDE*4 + movq xmm0, [r1+0] + movq xmm2, [r1+8] + add r1, 16 + punpcklwd xmm0, xmm0 + punpcklwd xmm2, xmm2 + pxor xmm1, xmm1 + pxor xmm3, xmm3 + paddw xmm0, [pw_32 GLOBAL] + paddw xmm2, [pw_32 GLOBAL] + psraw xmm0, 6 + psraw xmm2, 6 + psubw xmm1, xmm0 + psubw xmm3, xmm2 + packuswb xmm0, xmm1 + packuswb xmm2, xmm3 + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm0, xmm0 + punpcklbw xmm2, xmm2 + punpckhbw xmm1, xmm1 + punpckhbw xmm3, xmm3 + IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1 + IDCT_DC_STORE 0, xmm2, xmm3 + ret + +cglobal x264_add16x16_idct_dc_ssse3, 2,2 + call .loop + add r0, FDEC_STRIDE*4 +.loop: + add r0, FDEC_STRIDE*4 + movdqa xmm0, [r1] + add r1, 16 + pxor xmm1, xmm1 + paddw xmm0, [pw_32 GLOBAL] + psraw xmm0, 6 + psubw xmm1, xmm0 + movdqa xmm5, [ pb_idctdc_unpack GLOBAL] + movdqa xmm6, [pb_idctdc_unpack2 GLOBAL] + packuswb xmm0, xmm0 + packuswb xmm1, xmm1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pshufb xmm0, xmm5 + pshufb xmm2, xmm6 + pshufb xmm1, xmm5 + pshufb xmm3, xmm6 + IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1 + IDCT_DC_STORE 0, xmm2, xmm3 + ret + ;----------------------------------------------------------------------------- ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- diff --git a/common/x86/dct.h b/common/x86/dct.h index d30fa972..99392761 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -34,9 +34,12 @@ void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] ); void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] ); void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] ); void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] ); +void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[4][4] ); void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] ); void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] ); +void x264_add16x16_idct_dc_sse2( uint8_t *p_dst, int16_t dct[4][4] ); void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] ); +void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[4][4] ); void x264_dct4x4dc_mmx ( int16_t d[4][4] ); void x264_idct4x4dc_mmx ( int16_t d[4][4] ); diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index d1fd8693..d1b39919 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -29,6 +29,7 @@ SECTION_RODATA pb_1: times 16 db 1 pw_1: times 8 dw 1 pd_1: times 4 dd 1 +pb_01: times 8 db 0, 1 %macro DQM4 3 dw %1, %2, %1, %2, %2, %3, %2, %3 @@ -70,7 +71,7 @@ decimate_mask_table4: SECTION .text -%macro QUANT_DC_START 0 +%macro QUANT_DC_START_MMX 0 movd m6, r1m ; mf movd m7, r2m ; bias %ifidn m0, mm0 @@ -84,6 +85,14 @@ SECTION .text %endif %endmacro +%macro QUANT_DC_START_SSSE3 0 + movdqa m5, [pb_01 GLOBAL] + movd m6, r1m ; mf + movd m7, r2m ; bias + pshufb m6, m5 + pshufb m7, m5 +%endmacro + %macro PABSW_MMX 2 pxor %1, %1 pcmpgtw %1, %2 @@ -105,7 +114,7 @@ SECTION .text psignw %1, %2 %endmacro -%macro QUANT_ONE 3 +%macro QUANT_ONE 4 ;;; %1 (m64) dct[y][x] ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t) ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t) @@ 
-115,6 +124,62 @@ SECTION .text pmulhuw m0, %2 ; divide PSIGNW m0, m1 ; restore sign mova %1, m0 ; store +%if %4 + por m5, m0 +%else + SWAP m5, m0 +%endif +%endmacro + +%macro QUANT_TWO 7 + mova m1, %1 + mova m3, %2 + PABSW m0, m1 + PABSW m2, m3 + paddusw m0, %5 + paddusw m2, %6 + pmulhuw m0, %3 + pmulhuw m2, %4 + PSIGNW m0, m1 + PSIGNW m2, m3 + mova %1, m0 + mova %2, m2 +%if %7 + por m5, m0 + por m5, m2 +%else + SWAP m5, m0 + por m5, m2 +%endif +%endmacro + +%macro QUANT_END_MMX 0 + xor eax, eax +%ifndef ARCH_X86_64 +%if mmsize==8 + packsswb m5, m5 + movd ecx, m5 + test ecx, ecx +%else + pxor m4, m4 + pcmpeqb m5, m4 + pmovmskb ecx, m5 + cmp ecx, (1<pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8]; int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 ); int i_cost = 0; + h->mb.i_cbp_luma = 0; b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0]; // FIXME some bias like in i4x4? @@ -732,6 +733,11 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); + h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]]; + h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]]; + h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]]; + h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]]; + h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma; if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) ); } @@ -751,6 +757,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ { int i_cost; int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ); + h->mb.i_cbp_luma = 0; b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0]; if( a->i_mbrd ) i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8; @@ -817,6 +824,11 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); + h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]]; + h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]]; + h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]]; + h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]]; + h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma; if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) ); } @@ -1951,6 +1963,8 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd ) x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref ); x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref ); x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref ); + /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection + * for future blocks are those left over from previous RDO calls. 
*/ for( i = 0; i < 4; i++ ) { int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost}; diff --git a/encoder/cabac.c b/encoder/cabac.c index 4fa74033..2015da5e 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -1142,20 +1142,10 @@ static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, i static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_mode ) { const int i_pred = x264_mb_predict_intra4x4_mode( h, 4*i8 ); - const int nnz = array_non_zero(h->dct.luma8x8[i8]); i_mode = x264_mb_pred_mode4x4_fix( i_mode ); x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode ); - if( nnz ) - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101; + if( h->mb.i_cbp_luma & (1 << i8) ) block_residual_write_cabac_8x8( h, cb, 4*i8, h->dct.luma8x8[i8] ); - } - else - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0; - } } static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_mode ) @@ -1163,7 +1153,6 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 ); i_mode = x264_mb_pred_mode4x4_fix( i_mode ); x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode ); - h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] ); block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 ); } diff --git a/encoder/cavlc.c b/encoder/cavlc.c index 4f4ff033..e499fac5 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -702,7 +702,6 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode ) static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode ) { h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode ); - h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] ); block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 ); return h->out.bs.i_bits_encoded; } diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 30df7781..6faa305e 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -84,26 +84,38 @@ static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] ) dct4x4[3][0][0] = 0; } -static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx ) +static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx ) { int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY; if( h->mb.b_trellis ) - x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx ); + return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx ); else - h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); + return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); } -static ALWAYS_INLINE void x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx ) +static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx ) { int i_quant_cat = b_intra ? 
CQM_8IY : CQM_8PY; if( h->mb.b_trellis ) - x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx ); + return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx ); else - h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] ); + return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] ); } +/* All encoding functions must output the correct CBP and NNZ values. + * The entropy coding functions will check CBP first, then NNZ, before + * actually reading the DCT coefficients. NNZ still must be correct even + * if CBP is zero because of the use of NNZ values for context selection. + * "NNZ" need only be 0 or 1 rather than the exact coefficient count because + * that is only needed in CAVLC, and will be calculated by CAVLC's residual + * coding and stored as necessary. */ + +/* This means that decimation can be done merely by adjusting the CBP and NNZ + * rather than memsetting the coefficients. */ + void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp ) { + int nz; uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]]; uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]]; DECLARE_ALIGNED_16( int16_t dct4x4[4][4] ); @@ -111,29 +123,36 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp ) if( h->mb.b_lossless ) { h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst ); + nz = array_non_zero( h->dct.luma4x4[idx] ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = nz; + h->mb.i_cbp_luma |= nz<<(idx>>2); return; } h->dctf.sub4x4_dct( dct4x4, p_src, p_dst ); - x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx ); - - if( array_non_zero( dct4x4 ) ) + nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = nz; + if( nz ) { + h->mb.i_cbp_luma |= 1<<(idx>>2); h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 ); h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp ); - - /* output samples to fdec */ h->dctf.add4x4_idct( p_dst, dct4x4 ); } - else - memset( h->dct.luma4x4[idx], 0, sizeof(h->dct.luma4x4[idx])); +} + +#define STORE_8x8_NNZ(idx,nz)\ +{\ + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] = nz * 0x0101;\ + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] = nz * 0x0101;\ } void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp ) { int x = 8 * (idx&1); int y = 8 * (idx>>1); + int nz; uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE]; uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE]; DECLARE_ALIGNED_16( int16_t dct8x8[8][8] ); @@ -141,16 +160,25 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp ) if( h->mb.b_lossless ) { h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst ); + nz = array_non_zero( h->dct.luma8x8[idx] ); + STORE_8x8_NNZ(idx,nz); + h->mb.i_cbp_luma |= nz<<idx; return; } h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst ); - x264_quant_8x8( h, dct8x8, i_qp, 1, idx ); - + nz = x264_quant_8x8( h, dct8x8, i_qp, 1, idx ); h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 ); - h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp ); - h->dctf.add8x8_idct8( p_dst, dct8x8 ); + if( nz ) + { + h->mb.i_cbp_luma |= 1<<idx; + h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp ); + h->dctf.add8x8_idct8( p_dst, dct8x8 ); + STORE_8x8_NNZ(idx,1); + } + else + STORE_8x8_NNZ(idx,0); } static void x264_mb_encode_i16x16( x264_t *h, int i_qp ) { @@ -161,7 +189,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp ) DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] ); DECLARE_ALIGNED_16( int16_t
dct_dc4x4[4][4] ); - int i; + int i, nz; if( h->mb.b_lossless ) { @@ -172,12 +200,18 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp ) h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od ); dct_dc4x4[0][block_idx_yx_1d[i]] = h->dct.luma4x4[i][0]; h->dct.luma4x4[i][0] = 0; + nz = array_non_zero( h->dct.luma4x4[i] ); + h->mb.cache.non_zero_count[x264_scan8[i]] = nz; + h->mb.i_cbp_luma |= nz; } + h->mb.i_cbp_luma *= 0xf; + h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( dct_dc4x4 ); h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 ); return; } h->dctf.sub16x16_dct( dct4x4, p_src, p_dst ); + for( i = 0; i < 16; i++ ) { /* copy dc coeff */ @@ -185,36 +219,45 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp ) dct4x4[i][0][0] = 0; /* quant/scan/dequant */ - x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i ); - - h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp ); + nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i ); + h->mb.cache.non_zero_count[x264_scan8[i]] = nz; + if( nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] ); + h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp ); + h->mb.i_cbp_luma = 0xf; + } } h->dctf.dct4x4dc( dct_dc4x4 ); if( h->mb.b_trellis ) - x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1); + nz = x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1); else - h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 ); - h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 ); + nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 ); - /* output samples to fdec */ - h->dctf.idct4x4dc( dct_dc4x4 ); - h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */ - - /* calculate dct coeffs */ - for( i = 0; i < 16; i++ ) + h->mb.cache.non_zero_count[x264_scan8[24]] = nz; + if( nz ) { - /* copy dc coeff */ - dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]]; + h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 ); + + /* output samples to fdec */ + h->dctf.idct4x4dc( dct_dc4x4 ); + h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */ + if( h->mb.i_cbp_luma ) + for( i = 0; i < 16; i++ ) + dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]]; } + /* put pixels to fdec */ - h->dctf.add16x16_idct( p_dst, dct4x4 ); + if( h->mb.i_cbp_luma ) + h->dctf.add16x16_idct( p_dst, dct4x4 ); + else if( nz ) + h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 ); } void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) { - int i, ch, nz; + int i, ch, nz, nz_dc; int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate); h->mb.i_cbp_chroma = 0; @@ -223,6 +266,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) uint8_t *p_src = h->mb.pic.p_fenc[1+ch]; uint8_t *p_dst = h->mb.pic.p_fdec[1+ch]; int i_decimate_score = 0; + int nz_ac = 0; DECLARE_ALIGNED_16( int16_t dct2x2[2][2] ); DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] ); @@ -250,52 +294,49 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) for( i = 0; i < 4; i++ ) { if( h->mb.b_trellis ) - x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 ); + nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 ); else - 
h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] ); - - if( b_decimate ) - i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] ); + nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); + h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz; + if( nz ) + { + nz_ac = 1; + h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] ); + h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp ); + if( b_decimate ) + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] ); + } } if( h->mb.b_trellis ) - x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter ); + nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter ); else - h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 ); + nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 ); - if( b_decimate && i_decimate_score < 7 ) + h->mb.cache.non_zero_count[x264_scan8[25]+ch] = nz_dc; + + if( (b_decimate && i_decimate_score < 7) || !nz_ac ) { /* Decimate the block */ h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0; h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0; h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0; h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0; - if( !array_non_zero( dct2x2 ) ) /* Whole block is empty */ - { - h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0; + if( !nz_dc ) /* Whole block is empty */ continue; - } /* DC-only */ - h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1; zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); h->dctf.add8x8_idct_dc( p_dst, dct2x2 ); } else { - for( i = 0; i < 4; i++ ) + h->mb.i_cbp_chroma = 1; + if( nz_dc ) { - nz = array_non_zero( h->dct.luma4x4[16+ch*4+i] ); - h->mb.cache.non_zero_count[x264_scan8[16+ch*4+i]] = nz; - h->mb.i_cbp_chroma |= nz; - if( nz ) - h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp ); + zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); + idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); } - /* Don't optimize for the AC-only case--it's very rare */ - h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( dct2x2 ); - zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); - idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); h->dctf.add8x8_idct( p_dst, dct4x4 ); } } @@ -423,8 +464,9 @@ void x264_macroblock_encode( x264_t *h ) int i_qp = h->mb.i_qp; int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate; int b_force_no_skip = 0; - int i,j,idx; - uint8_t nnz8x8[4] = {1,1,1,1}; + int i,idx,nz; + h->mb.i_cbp_luma = 0; + h->mb.cache.non_zero_count[x264_scan8[24]] = 0; if( h->sh.b_mbaff && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride @@ -479,6 +521,11 @@ void x264_macroblock_encode( x264_t *h ) if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 ); + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i8x8_nnz_buf[0]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i8x8_nnz_buf[1]; + 
*(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i8x8_nnz_buf[2]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i8x8_nnz_buf[3]; + h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp; /* In RD mode, restore the now-overwritten DCT data. */ if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) ); @@ -496,8 +543,6 @@ void x264_macroblock_encode( x264_t *h ) x264_mb_encode_i8x8( h, i, i_qp ); } - for( i = 0; i < 4; i++ ) - nnz8x8[i] = array_non_zero( h->dct.luma8x8[i] ); } else if( h->mb.i_type == I_4x4 ) { @@ -506,6 +551,11 @@ void x264_macroblock_encode( x264_t *h ) if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 ); + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i4x4_nnz_buf[0]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i4x4_nnz_buf[1]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i4x4_nnz_buf[2]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i4x4_nnz_buf[3]; + h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp; /* In RD mode, restore the now-overwritten DCT data. */ if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) ); @@ -545,7 +595,9 @@ void x264_macroblock_encode( x264_t *h ) h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE, h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE ); - nnz8x8[i8x8] = array_non_zero( h->dct.luma8x8[i8x8] ); + nz = array_non_zero( h->dct.luma8x8[i8x8] ); + STORE_8x8_NNZ(i8x8,nz); + h->mb.i_cbp_luma |= nz << i8x8; } else for( i4x4 = 0; i4x4 < 16; i4x4++ ) @@ -553,6 +605,9 @@ void x264_macroblock_encode( x264_t *h ) h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4], h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4], h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] ); + nz = array_non_zero( h->dct.luma4x4[i4x4] ); + h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz; + h->mb.i_cbp_luma |= nz << (i4x4>>2); } } else if( h->mb.b_transform_8x8 ) @@ -566,31 +621,44 @@ void x264_macroblock_encode( x264_t *h ) { if( h->mb.b_noise_reduction ) h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 ); - x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx ); + nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx ); - h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] ); - - if( b_decimate ) + if( nz ) { - int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] ); - i_decimate_mb += i_decimate_8x8; - if( i_decimate_8x8 < 4 ) - nnz8x8[idx] = 0; + h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] ); + if( b_decimate ) + { + int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] ); + i_decimate_mb += i_decimate_8x8; + if( i_decimate_8x8 >= 4 ) + h->mb.i_cbp_luma |= 1<mb.i_cbp_luma |= 1<mb.i_cbp_luma = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0; + } else { for( idx = 0; idx < 4; idx++ ) - if( nnz8x8[idx] ) + { + if( h->mb.i_cbp_luma&(1<quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp ); h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] ); + STORE_8x8_NNZ(idx,1); } + else + STORE_8x8_NNZ(idx,0); + } } } else @@ -601,41 +669,61 @@ void 
x264_macroblock_encode( x264_t *h ) for( i8x8 = 0; i8x8 < 4; i8x8++ ) { - int i_decimate_8x8; + int i_decimate_8x8 = 0; + int cbp = 0; /* encode one 4x4 block */ - i_decimate_8x8 = 0; for( i4x4 = 0; i4x4 < 4; i4x4++ ) { idx = i8x8 * 4 + i4x4; if( h->mb.b_noise_reduction ) h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 ); - x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx ); + nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = nz; - h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] ); - - if( b_decimate && i_decimate_8x8 < 6 ) - i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] ); + if( nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp ); + if( b_decimate && i_decimate_8x8 < 6 ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] ); + cbp = 1; + } } /* decimate this 8x8 block */ i_decimate_mb += i_decimate_8x8; - if( i_decimate_8x8 < 4 && b_decimate ) - nnz8x8[i8x8] = 0; + if( b_decimate ) + { + if( i_decimate_8x8 < 4 ) + STORE_8x8_NNZ(i8x8,0) + else + h->mb.i_cbp_luma |= 1<dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); + h->mb.i_cbp_luma |= 1<quantf.dequant_4x4( dct4x4[i8x8*4+i], h->dequant4_mf[CQM_4PY], i_qp ); - h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); - } + if( i_decimate_mb < 6 ) + { + h->mb.i_cbp_luma = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0; + } + else + { + for( i8x8 = 0; i8x8 < 4; i8x8++ ) + if( h->mb.i_cbp_luma&(1<dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); + } } } } @@ -656,49 +744,6 @@ void x264_macroblock_encode( x264_t *h ) /* encode the 8x8 blocks */ x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp ); - /* coded block pattern and non_zero_count */ - h->mb.i_cbp_luma = 0x00; - if( h->mb.i_type == I_16x16 ) - { - for( i = 0; i < 16; i++ ) - { - int nz = array_non_zero( h->dct.luma4x4[i] ); - h->mb.cache.non_zero_count[x264_scan8[i]] = nz; - h->mb.i_cbp_luma |= nz; - } - h->mb.i_cbp_luma *= 0xf; - h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( h->dct.luma16x16_dc ); - } - else - { - for( i = 0; i < 4; i++) - { - if(!nnz8x8[i]) - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+i*4]] = 0; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+i*4]] = 0; - } - else if( h->mb.b_transform_8x8 ) - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+4*i]] = nnz8x8[i] * 0x0101; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+4*i]] = nnz8x8[i] * 0x0101; - h->mb.i_cbp_luma |= nnz8x8[i] << i; - } - else - { - int nz, cbp = 0; - for( j = 0; j < 4; j++ ) - { - nz = array_non_zero( h->dct.luma4x4[j+4*i] ); - h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz; - cbp |= nz; - } - h->mb.i_cbp_luma |= cbp << i; - } - } - h->mb.cache.non_zero_count[x264_scan8[24]] = 0; - } - if( h->param.b_cabac ) { i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]] @@ -770,8 +815,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) /* encode one 4x4 block */ for( i4x4 = 0; i4x4 < 4; i4x4++ ) { - h->quantf.quant_4x4( dct4x4[i4x4], 
h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); - if( !array_non_zero(dct4x4[i4x4]) ) + if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ) ) continue; h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); i_decimate_mb += h->quantf.decimate_score16( dctscan ); @@ -805,15 +849,13 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) /* calculate dct DC */ dct2x2dc( dct2x2, dct4x4 ); - h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ); - if( array_non_zero(dct2x2) ) + if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) ) return 0; /* calculate dct coeffs */ for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ ) { - h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); - if( !array_non_zero(dct4x4[i4x4]) ) + if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) ) continue; h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); i_decimate_mb += h->quantf.decimate_score15( dctscan ); @@ -865,7 +907,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE; int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate; int nnz8x8 = 0; - int ch; + int ch, nz; x264_mb_mc_8x8( h, i8 ); @@ -876,8 +918,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) { h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec ); nnz8x8 = array_non_zero( h->dct.luma8x8[i8] ); - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101 * nnz8x8; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101 * nnz8x8; + STORE_8x8_NNZ(i8,nnz8x8); } else { @@ -898,9 +939,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE; h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec ); h->dct.luma4x4[16+i8+ch*4][0] = 0; + h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] ); } - h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero( h->dct.luma4x4[16+i8] ); - h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero( h->dct.luma4x4[20+i8] ); } else { @@ -908,67 +948,53 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) { DECLARE_ALIGNED_16( int16_t dct8x8[8][8] ); h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); - x264_quant_8x8( h, dct8x8, i_qp, 0, i8 ); - h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 ); - - if( b_decimate && !h->mb.b_trellis ) - nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] ); - else - nnz8x8 = array_non_zero( dct8x8 ); - + nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 ); if( nnz8x8 ) { - h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp ); - h->dctf.add8x8_idct8( p_fdec, dct8x8 ); - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101; + h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 ); + + if( b_decimate && !h->mb.b_trellis ) + nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] ); + + if( nnz8x8 ) + { + h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp ); + h->dctf.add8x8_idct8( p_fdec, dct8x8 ); + STORE_8x8_NNZ(i8,1); + } + else + STORE_8x8_NNZ(i8,0); } else - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0; - 
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0; - } + STORE_8x8_NNZ(i8,0); } else { int i4; + int i_decimate_8x8 = 0; DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] ); h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); for( i4 = 0; i4 < 4; i4++ ) - x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 ); - - for( i4 = 0; i4 < 4; i4++ ) - h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] ); - - if( b_decimate ) { - int i_decimate_8x8 = 0; - for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ ) - i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] ); - nnz8x8 = 4 <= i_decimate_8x8; + nz = x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 ); + h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = nz; + if( nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] ); + h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp ); + if( b_decimate ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] ); + nnz8x8 = 1; + } } - else - nnz8x8 = array_non_zero( dct4x4 ); + + if( b_decimate && i_decimate_8x8 < 4 ) + nnz8x8 = 0; if( nnz8x8 ) - { - for( i4 = 0; i4 < 4; i4++ ) - { - if( array_non_zero( dct4x4[i4] ) ) - { - h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp ); - h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = 1; - } - else - h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = 0; - } h->dctf.add8x8_idct( p_fdec, dct4x4 ); - } else - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0; - } + STORE_8x8_NNZ(i8,0); } i_qp = h->mb.i_chroma_qp; @@ -983,19 +1009,17 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) dct4x4[0][0] = 0; if( h->mb.b_trellis ) - x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 ); + nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 ); else - h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); + nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); - if( array_non_zero( dct4x4 ) ) + h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz; + if( nz ) { h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 ); h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp ); h->dctf.add4x4_idct( p_fdec, dct4x4 ); - h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 1; } - else - h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 0; } } h->mb.i_cbp_luma &= ~(1 << i8); @@ -1014,6 +1038,7 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 ) const int i_ref = h->mb.cache.ref[0][x264_scan8[i4]]; const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][0], h->mb.mv_min[0], h->mb.mv_max[0] ); const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][1], h->mb.mv_min[1], h->mb.mv_max[1] ); + int nz; h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 ); @@ -1026,15 +1051,13 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 ) { DECLARE_ALIGNED_16( int16_t dct4x4[4][4] ); h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); - x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 ); - if( array_non_zero( dct4x4 ) ) + nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 ); + h->mb.cache.non_zero_count[x264_scan8[i4]] = nz; + if( nz ) { h->zigzagf.scan_4x4( h->dct.luma4x4[i4], dct4x4 ); h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp ); 
h->dctf.add4x4_idct( p_fdec, dct4x4 ); - h->mb.cache.non_zero_count[x264_scan8[i4]] = 1; } - else - h->mb.cache.non_zero_count[x264_scan8[i4]] = 0; } } diff --git a/encoder/macroblock.h b/encoder/macroblock.h index 4cc599aa..7b9f08a3 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -55,11 +55,11 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ); void x264_cabac_mb_skip( x264_t *h, int b_skip ); -void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat, +int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat, int i_qp, int i_ctxBlockCat, int b_intra ); -void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, +int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, int i_qp, int i_ctxBlockCat, int b_intra, int idx ); -void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, +int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, int i_qp, int b_intra, int idx ); void x264_noise_reduction_update( x264_t *h ); diff --git a/encoder/rdo.c b/encoder/rdo.c index 1ba2a715..76cfdcaf 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -215,6 +215,8 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel ) if( i_pixel > PIXEL_8x8 ) return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel ); + h->mb.i_cbp_luma = 0; + x264_macroblock_encode_p8x8( h, i8 ); if( i_pixel == PIXEL_16x8 ) x264_macroblock_encode_p8x8( h, i8+1 ); @@ -243,6 +245,8 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel ) static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode ) { uint64_t i_ssd, i_bits; + h->mb.i_cbp_luma = 0; + h->mb.b_transform_8x8 = 1; x264_mb_encode_i8x8( h, i8, h->mb.i_qp ); i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 ); @@ -404,7 +408,7 @@ typedef struct { // comparable to the input. so unquant is the direct inverse of quant, // and uses the dct scaling factors, not the idct ones. 
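The trellis functions below now follow the same contract as the plain quant functions changed in common/quant.c: every quantizer reports whether any coefficient survived, so callers can fill the nnz cache and CBP directly instead of re-scanning blocks with array_non_zero(). A self-contained scalar sketch of that contract (the standalone form and the _sketch name are assumptions; the real change is the QUANT_ONE/quant_4x4 hunk earlier in the diff):

    #include <stdint.h>

    /* Deadzone quantization of a 4x4 block that also returns a 0/1
     * "any non-zero coefficient" flag, mirroring the patched QUANT_ONE. */
    static int quant_4x4_sketch( int16_t dct[16], const uint16_t mf[16],
                                 const uint16_t bias[16] )
    {
        int i, nz = 0;
        for( i = 0; i < 16; i++ )
        {
            if( dct[i] > 0 )
                dct[i] =  ( bias[i] + dct[i] ) * mf[i] >> 16;
            else
                dct[i] = -(( bias[i] - dct[i] ) * mf[i] >> 16);
            nz |= dct[i];
        }
        return !!nz;   /* exactly what the nnz cache and the CBP bits need */
    }

The macroblock.c hunks store this return value straight into h->mb.cache.non_zero_count[] and OR it into h->mb.i_cbp_luma, which is also why decimation can now be done by clearing NNZ/CBP instead of memsetting coefficients.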
-static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct, +static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct, const uint16_t *quant_mf, const int *unquant_mf, const int *coef_weight, const uint8_t *zigzag, int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx ) @@ -419,7 +423,7 @@ static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct, const int b_interlaced = h->mb.b_interlaced; const int f = 1 << 15; // no deadzone int i_last_nnz; - int i, j; + int i, j, nz; // (# of coefs) * (# of ctx) * (# of levels tried) = 1024 // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough, @@ -438,7 +442,7 @@ static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct, if( i < b_ac ) { memset( dct, 0, i_coefs * sizeof(*dct) ); - return; + return 0; } i_last_nnz = i; @@ -613,39 +617,42 @@ static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct, bnode = &nodes_cur[j]; j = bnode->level_idx; + nz = 0; for( i = b_ac; i < i_coefs; i++ ) { dct[zigzag[i]] = level_tree[j].abs_level * signs[i]; + nz |= level_tree[j].abs_level; j = level_tree[j].next; } + return !!nz; } const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3}; -void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat, +int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat, int i_qp, int i_ctxBlockCat, int b_intra ) { - quant_trellis_cabac( h, (int16_t*)dct, + return quant_trellis_cabac( h, (int16_t*)dct, h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced], i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 ); } -void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, +int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, int i_qp, int i_ctxBlockCat, int b_intra, int idx ) { int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC); - quant_trellis_cabac( h, (int16_t*)dct, + return quant_trellis_cabac( h, (int16_t*)dct, h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_dct4_weight2_zigzag[h->mb.b_interlaced], x264_zigzag_scan4[h->mb.b_interlaced], i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx ); } -void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, +int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, int i_qp, int b_intra, int idx ) { - quant_trellis_cabac( h, (int16_t*)dct, + return quant_trellis_cabac( h, (int16_t*)dct, h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp], x264_dct8_weight2_zigzag[h->mb.b_interlaced], x264_zigzag_scan8[h->mb.b_interlaced], diff --git a/tools/checkasm.c b/tools/checkasm.c index 9bc802ad..3f89e681 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -558,6 +558,7 @@ static int check_dct( int cpu_ref, int cpu_new ) TEST_IDCT( add8x8_idct, dct4 ); TEST_IDCT( add8x8_idct_dc, dct4 ); TEST_IDCT( add16x16_idct, dct4 ); + TEST_IDCT( add16x16_idct_dc, dct4 ); report( "add_idct4 :" ); ok = 1; used_asm = 0; @@ -958,7 +959,7 @@ static int check_quant( int cpu_ref, int cpu_new ) DECLARE_ALIGNED_16( uint8_t cqm_buf[64] ); int ret = 0, ok, used_asm; int oks[2] = {1,1}, used_asms[2] = {0,0}; - int i, i_cqm, qp; + int i, j, i_cqm, qp; x264_t h_buf; x264_t *h = &h_buf; memset( h, 0, sizeof(*h) ); @@ -1007,7 +1008,7 @@ static int check_quant( int cpu_ref, int cpu_new ) for( x = 0; x < 8; 
x++ ) \ { \ unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \ - dct1[y*8+x] = dct2[y*8+x] = (rand()%(2*scale+1))-scale; \ + dct1[y*8+x] = dct2[y*8+x] = j ? (rand()%(2*scale+1))-scale : 0; \ } \ } @@ -1019,7 +1020,7 @@ static int check_quant( int cpu_ref, int cpu_new ) for( x = 0; x < 4; x++ ) \ { \ unsigned int scale = 255*scale1d[y]*scale1d[x]; \ - dct1[y*4+x] = dct2[y*4+x] = (rand()%(2*scale+1))-scale; \ + dct1[y*4+x] = dct2[y*4+x] = j ? (rand()%(2*scale+1))-scale : 0; \ } \ } @@ -1030,18 +1031,22 @@ static int check_quant( int cpu_ref, int cpu_new ) used_asms[0] = 1; \ for( qp = 51; qp > 0; qp-- ) \ { \ - for( i = 0; i < 16; i++ ) \ - dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \ - call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ - call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ - if( memcmp( dct1, dct2, 16*2 ) ) \ + for( j = 0; j < 2; j++ ) \ { \ - oks[0] = 0; \ - fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \ - break; \ + int result_c, result_a; \ + for( i = 0; i < 16; i++ ) \ + dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \ + result_c = call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ + result_a = call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ + if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \ + { \ + oks[0] = 0; \ + fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \ + break; \ + } \ + call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ + call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ } \ - call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ - call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ } \ } @@ -1052,17 +1057,21 @@ static int check_quant( int cpu_ref, int cpu_new ) used_asms[0] = 1; \ for( qp = 51; qp > 0; qp-- ) \ { \ - INIT_QUANT##w() \ - call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - if( memcmp( dct1, dct2, w*w*2 ) ) \ + for( j = 0; j < 2; j++ ) \ { \ - oks[0] = 0; \ - fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ - break; \ + int result_c, result_a; \ + INIT_QUANT##w() \ + result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + if( memcmp( dct1, dct2, w*w*2 ) || result_c != result_a ) \ + { \ + oks[0] = 0; \ + fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ + break; \ + } \ + call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ } \ - call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ } \ } @@ -1078,6 +1087,7 @@ static int check_quant( int cpu_ref, int cpu_new ) { \ set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \ used_asms[1] = 1; \ + j = 1; \ for( qp = 
51; qp > 0; qp-- ) \ { \ INIT_QUANT##w() \
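The checkasm changes above now exercise each quant function on an all-zero block as well as a random one (the j loop), and require the returned non-zero flag to match between the C reference and the asm version in addition to the coefficient output. For the SSE4 versions mentioned in the commit message, that flag comes from ptest; the following intrinsics sketch only illustrates the same test (function name and the intrinsics-based form are assumptions; the patch implements it in hand-written asm):

    #include <smmintrin.h>   /* SSE4.1 */
    #include <stdint.h>

    /* Return 1 if any of the 16 int16_t coefficients is non-zero.
     * PTEST sets ZF when (v & v) == 0, i.e. when the block is empty. */
    static int block_has_nonzero_sse4( const int16_t dct[16] )
    {
        __m128i a = _mm_loadu_si128( (const __m128i*)&dct[0] );
        __m128i b = _mm_loadu_si128( (const __m128i*)&dct[8] );
        __m128i v = _mm_or_si128( a, b );
        return !_mm_testz_si128( v, v );
    }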