From 205a032c22467c90c26d33ed9ab23d60461e57c1 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Tue, 30 Jun 2009 15:20:32 -0700
Subject: [PATCH] Early termination for chroma encoding

Faster chroma encoding by terminating early if heuristics indicate that the
block will be DC-only. This works because the vast majority of inter chroma
blocks have no coefficients at all, and those that do are almost always
DC-only. Add two new helper DSP functions for this: sub8x8_dct_dc and
var2_8x8, with mmxext/sse2 versions of the former and mmxext/sse2/ssse3
versions of the latter. Early termination is disabled at very low QPs because
it is not useful there. Performance increase is ~1-2% without trellis, up to
5-6% with trellis=2. The increase is greater at lower bitrates.

---
 common/dct.c           |  25 +++++++++
 common/dct.h           |   1 +
 common/pixel.c         |  28 ++++++++++
 common/pixel.h         |   1 +
 common/x86/dct-a.asm   |  74 +++++++++++++++++++++++++++
 common/x86/dct.h       |   3 +-
 common/x86/pixel-a.asm | 113 +++++++++++++++++++++++++++++++++++++++++
 common/x86/pixel.h     |   3 ++
 encoder/macroblock.c   |  60 +++++++++++++++++++++-
 tools/checkasm.c       |  19 +++++++
 10 files changed, 325 insertions(+), 2 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 1f8f4b39..3a2d9161 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -170,6 +170,28 @@ static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 }
 
+static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
+{
+    /* DC term of a 4x4 block: sum of the 16 differences from pixel_sub_wxh. */
+    int16_t diff[4][4];
+    int dc = 0;
+    int row, col;
+
+    pixel_sub_wxh( (int16_t*)diff, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+
+    for( row = 0; row < 4; row++ )
+        for( col = 0; col < 4; col++ )
+            dc += diff[row][col];
+    return dc;
+}
+
+static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
+{
+    int i; /* 4x4 quadrant in raster order: bit 0 selects x, bit 1 selects y */
+    for( i = 0; i < 4; i++ )
+        dct[i>>1][i&1] = sub4x4_dct_dc( &pix1[4*(i>>1)*FENC_STRIDE + 4*(i&1)],
+                                        &pix2[4*(i>>1)*FDEC_STRIDE + 4*(i&1)] );
+}
 
 static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
 {
@@ -391,6 +413,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     dctf->add4x4_idct   = add4x4_idct;
 
     dctf->sub8x8_dct    = sub8x8_dct;
+    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
     dctf->add8x8_idct   = add8x8_idct;
     dctf->add8x8_idct_dc = add8x8_idct_dc;
 
@@ -416,6 +439,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
+        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
 
 #ifndef ARCH_X86_64
         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
@@ -434,6 +458,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     {
         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
+        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 
diff --git a/common/dct.h b/common/dct.h
index 3819ce11..a38bf919 100644
--- a/common/dct.h
+++ b/common/dct.h
@@ -95,6 +95,7 @@ typedef struct
     void (*add4x4_idct)  ( uint8_t *p_dst, int16_t dct[4][4] );
 
     void (*sub8x8_dct)   ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
+    void (*sub8x8_dct_dc)( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
     void (*add8x8_idct)  ( uint8_t *p_dst, int16_t dct[4][4][4] );
     void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[2][2] );
 
diff --git a/common/pixel.c b/common/pixel.c
index 5932f07f..852748ec 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -160,6 +160,30 @@ static int name( uint8_t *pix, int i_stride ) \
 PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
 PIXEL_VAR_C( x264_pixel_var_8x8,    8, 6 )
 
+/****************************************************************************
+ * pixel_var2_8x8: stores the SSD of the 8x8 residual pix1-pix2 in *ssd and
+ * returns ssd - (sum^2 >> 6), where sum is the sum of the 64 differences.
+ ****************************************************************************/
+static int pixel_var2_8x8( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2, int *ssd )
+{
+    /* |sum| <= 64*255, so sum*sum fits comfortably in a signed 32-bit int. */
+    int sum = 0, sqr = 0;
+    int x, y;
+    for( y = 0; y < 8; y++ )
+    {
+        for( x = 0; x < 8; x++ )
+        {
+            int diff = pix1[x] - pix2[x];
+            sum += diff;
+            sqr += diff * diff;
+        }
+        pix1 += i_stride1;
+        pix2 += i_stride2;
+    }
+    *ssd = sqr;
+    return sqr - (sum * sum >> 6);
+}
+
 
 #define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\
     int t0 = s0 + s1;\
@@ -611,6 +635,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 
     pixf->ssim_4x4x2_core = ssim_4x4x2_core;
     pixf->ssim_end4 = ssim_end4;
+    pixf->var2_8x8 = pixel_var2_8x8;
 
 #ifdef HAVE_MMX
     if( cpu&X264_CPU_MMX )
@@ -636,6 +661,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmxext;
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
         pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_mmxext;
+        pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
 
         if( cpu&X264_CPU_CACHELINE_32 )
         {
@@ -682,6 +708,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #ifdef ARCH_X86_64
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
 #endif
+        pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
     }
 
     if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
@@ -761,6 +788,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #ifdef ARCH_X86_64
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
 #endif
+        pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
         if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( sad, _cache64_ssse3 );
diff --git a/common/pixel.h b/common/pixel.h
index 207c74f2..53f99566 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -73,6 +73,7 @@ typedef struct
     x264_pixel_cmp_x3_t fpelcmp_x3[7];
     x264_pixel_cmp_x4_t fpelcmp_x4[7];
     x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+    int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * );
 
     int (*var[4])( uint8_t *pix, int stride );
     uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 6e92df6f..64cde9c3 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -36,6 +36,7 @@ pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
 pb_1: times 16 db 1
+pw_1: times 8 dw 1
 
 SECTION .text
 
@@ -427,6 +428,79 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
     IDCT_DC_STORE 0, xmm2, xmm3
     ret
 
+;-----------------------------------------------------------------------------
+; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+
+%macro DCTDC_2ROW_MMX 3
+    movq      %1, [r1+FENC_STRIDE*(0+%3)] ; pix1, first row of the pair (arg3 = row offset)
+    movq      m1, [r1+FENC_STRIDE*(1+%3)] ; pix1, second row
+    movq      m2, [r2+FDEC_STRIDE*(0+%3)] ; pix2, first row
+    movq      m3, [r2+FDEC_STRIDE*(1+%3)] ; pix2, second row
+    movq      %2, %1
+    punpckldq %1, m1 ; arg1 = left 4 pixels of both pix1 rows
+    punpckhdq %2, m1 ; arg2 = right 4 pixels of both pix1 rows
+    movq      m1, m2
+    punpckldq m2, m3 ; m2 = left 4 pixels of both pix2 rows
+    punpckhdq m1, m3 ; m1 = right 4 pixels of both pix2 rows
+    psadbw    %1, m7 ; SAD against m7 (zeroed by the visible caller) = sum of the 8 bytes
+    psadbw    %2, m7
+    psadbw    m2, m7
+    psadbw    m1, m7
+    psubw     %1, m2 ; arg1 = sum(pix1) - sum(pix2) over the left 4x2 pixels
+    psubw     %2, m1 ; arg2 = the same over the right 4x2 pixels
+%endmacro
+
+INIT_MMX
+cglobal x264_sub8x8_dct_dc_mmxext, 3,3
+    pxor      m7, m7 ; zero operand for the psadbw byte sums
+    call .loop ; top half: rows 0-3, stored at [r0]
+    add       r1, FENC_STRIDE*4 ; then move both sources down 4 rows...
+    add       r2, FDEC_STRIDE*4
+    add       r0, 4 ; ...and the output forward by two int16
+.loop: ; the bottom half falls through here; its ret returns to the caller
+    DCTDC_2ROW_MMX m0, m4, 0 ; rows 0-1: m0 = left sum, m4 = right sum
+    DCTDC_2ROW_MMX m5, m6, 2 ; rows 2-3: m5 = left sum, m6 = right sum
+    paddw     m0, m5 ; DC of the left 4x4 block
+    paddw     m4, m6 ; DC of the right 4x4 block
+    punpcklwd m0, m4 ; interleave into words {left, right}
+    movd    [r0], m0 ; store the two int16 coefficients
+    ret
+
+INIT_XMM
+%macro DCTDC_2ROW_SSE2 3
+    movq      m0, [r1+FENC_STRIDE*(0+%1)] ; pix1, first row of the pair (arg1 = row offset)
+    movq      m1, [r1+FENC_STRIDE*(1+%1)] ; pix1, second row
+    movq      m2, [r2+FDEC_STRIDE*(0+%1)] ; pix2, first row
+    movq      m3, [r2+FDEC_STRIDE*(1+%1)] ; pix2, second row
+    punpckldq m0, m1 ; low qword = left 4 pixels of both rows, high qword = right 4
+    punpckldq m2, m3
+    psadbw    m0, m7 ; per-qword byte sums (m7 is zeroed by the visible caller)
+    psadbw    m2, m7
+%if %2
+    paddw     %3, m0 ; arg2 != 0: accumulate pix1 sums into arg3 and pix2 sums into m6
+    paddw     m6, m2
+%else
+    SWAP      %3, m0 ; arg2 == 0: start arg3 / m6 from these sums (SWAP is defined outside this patch)
+    SWAP      m6, m2
+%endif
+%endmacro
+
+cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
+    pxor     m7, m7 ; zero operand for the psadbw byte sums
+    DCTDC_2ROW_SSE2 0, 0, m4 ; rows 0-1: start m4 (pix1 sums) and m6 (pix2 sums)
+    DCTDC_2ROW_SSE2 2, 1, m4 ; rows 2-3: accumulate
+    add      r1, FENC_STRIDE*4 ; move both sources down to the bottom 4 rows
+    add      r2, FDEC_STRIDE*4
+    psubq    m4, m6 ; m4 = {left, right} DC of the top half
+    DCTDC_2ROW_SSE2 0, 0, m5 ; bottom half: same again into m5 / m6
+    DCTDC_2ROW_SSE2 2, 1, m5
+    psubq    m5, m6 ; m5 = {left, right} DC of the bottom half
+    packssdw m4, m5 ; narrow the four 64-bit differences...
+    packssdw m4, m4 ; ...down to four int16 in the low qword
+    movq   [r0], m4 ; store all four coefficients (8 bytes)
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
 ;-----------------------------------------------------------------------------
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 44518212..87ab8fc0 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -32,7 +32,8 @@ void x264_sub16x16_dct_sse2  ( int16_t dct[16][4][4],  uint8_t *pix1, uint8_t *p
 void x264_sub4x4_dct_ssse3   ( int16_t dct[ 4][4]   ,  uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_ssse3   ( int16_t dct[ 4][4][4],  uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_ssse3 ( int16_t dct[16][4][4],  uint8_t *pix1, uint8_t *pix2 );
-
+void x264_sub8x8_dct_dc_mmxext( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2 ( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
 
 void x264_add4x4_idct_mmx    ( uint8_t *p_dst, int16_t dct[ 4][4]    );
 void x264_add8x8_idct_mmx    ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 9617f9e4..6a235c3d 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -386,6 +386,119 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8
     jg .loop
     VAR_END 6
 
+%macro VAR2_END 0
+    HADDW   m5, m7 ; presumably a horizontal add of the word sums in m5, m7 as scratch (macro not in this patch)
+    movd   r1d, m5 ; r1d = sum of the differences
+    imul   r1d, r1d ; sum * sum
+    HADDD   m6, m1 ; presumably a horizontal add of the dword squares in m6, m1 as scratch
+    shr    r1d, 6 ; >> 6, matching the C reference
+    movd   eax, m6 ; eax = sum of squared differences
+    mov   [r4], eax ; store it through r4 (the int* argument in the callers shown)
+    sub    eax, r1d  ; sqr - (sum * sum >> shift)
+    RET ; NOTE(review): if RET ends the function, code placed after VAR2_END never runs - confirm
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
+;-----------------------------------------------------------------------------
+%ifndef ARCH_X86_64
+INIT_MMX
+cglobal x264_pixel_var2_8x8_mmxext, 5,6
+    VAR_START 0
+    mov      r5d, 8 ; one row per iteration
+.loop:
+    movq      m0, [r0] ; 8 pixels of pix1
+    movq      m1, m0
+    movq      m4, m0 ; NOTE(review): m4 is not read again in the visible code - looks unused, confirm
+    movq      m2, [r2] ; 8 pixels of pix2
+    movq      m3, m2
+    punpcklbw m0, m7 ; widen bytes to words; assumes VAR_START zeroed m7 - TODO confirm
+    punpckhbw m1, m7
+    punpcklbw m2, m7
+    punpckhbw m3, m7
+    psubw     m0, m2 ; differences, pixels 0-3
+    psubw     m1, m3 ; differences, pixels 4-7
+    paddw     m5, m0 ; m5 accumulates the differences
+    paddw     m5, m1
+    pmaddwd   m0, m0 ; square and add adjacent pairs
+    pmaddwd   m1, m1
+    paddd     m6, m0 ; m6 accumulates the squared differences
+    paddd     m6, m1
+    add       r0, r1 ; advance both pointers by their strides
+    add       r2, r3
+    dec       r5d
+    jg .loop
+    VAR2_END
+    RET
+%endif
+
+INIT_XMM
+cglobal x264_pixel_var2_8x8_sse2, 5,6,8
+    VAR_START 1
+    mov      r5d, 4 ; two rows per iteration
+.loop:
+    movq      m1, [r0] ; pix1: one row in the low half...
+    movhps    m1, [r0+r1] ; ...the next row in the high half
+    movq      m3, [r2] ; pix2, same layout
+    movhps    m3, [r2+r3]
+    DEINTB    0, 1, 2, 3, 7 ; presumably splits m1/m3 bytes into words in m0,m1 / m2,m3 (macro not in this patch)
+    psubw     m0, m2 ; word differences pix1 - pix2
+    psubw     m1, m3
+    paddw     m5, m0 ; m5 accumulates the differences
+    paddw     m5, m1
+    pmaddwd   m0, m0 ; square and add adjacent pairs
+    pmaddwd   m1, m1
+    paddd     m6, m0 ; m6 accumulates the squared differences
+    paddd     m6, m1
+    lea       r0, [r0+r1*2] ; advance both pointers by two rows
+    lea       r2, [r2+r3*2]
+    dec      r5d
+    jg .loop
+    VAR2_END
+    RET
+
+cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
+    pxor      m5, m5    ; sum of differences
+    pxor      m6, m6    ; sum of squared differences
+    mova      m7, [hsub_mul GLOBAL] ; presumably +1/-1 byte weights so pmaddubsw yields pix1 - pix2 - TODO confirm
+    mov      r5d, 2 ; four rows per iteration
+.loop:
+    movq      m0, [r0] ; pix1 row 0
+    movq      m2, [r2] ; pix2 row 0
+    movq      m1, [r0+r1] ; pix1 row 1
+    movq      m3, [r2+r3] ; pix2 row 1
+    lea       r0, [r0+r1*2] ; step to rows 2-3 of this group
+    lea       r2, [r2+r3*2]
+    punpcklbw m0, m2 ; interleave pix1/pix2 bytes of row 0
+    punpcklbw m1, m3 ; interleave pix1/pix2 bytes of row 1
+    movq      m2, [r0] ; pix1 row 2
+    movq      m3, [r2] ; pix2 row 2
+    punpcklbw m2, m3 ; interleave row 2
+    movq      m3, [r0+r1] ; pix1 row 3
+    movq      m4, [r2+r3] ; pix2 row 3
+    punpcklbw m3, m4 ; interleave row 3
+    pmaddubsw m0, m7 ; combine each pix1/pix2 byte pair into one word using the m7 weights
+    pmaddubsw m1, m7
+    pmaddubsw m2, m7
+    pmaddubsw m3, m7
+    paddw     m5, m0 ; accumulate the four rows of words into the sum
+    paddw     m5, m1
+    paddw     m5, m2
+    paddw     m5, m3
+    pmaddwd   m0, m0 ; square and add adjacent pairs
+    pmaddwd   m1, m1
+    pmaddwd   m2, m2
+    pmaddwd   m3, m3
+    paddd     m6, m0 ; accumulate the squares
+    paddd     m6, m1
+    paddd     m6, m2
+    paddd     m6, m3
+    lea       r0, [r0+r1*2] ; step to the next group of four rows
+    lea       r2, [r2+r3*2]
+    dec      r5d
+    jg .loop
+    VAR2_END
+    RET
 
 ;=============================================================================
 ; SATD
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 312aca8f..b1e22cee 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -102,6 +102,9 @@ void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
 void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
                                       const uint8_t *pix2, int stride2, int sums[2][4] );
 float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
+int  x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * );
+int  x264_pixel_var2_8x8_sse2( uint8_t *, int, uint8_t *, int, int * );
+int  x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
 
 #define DECL_ADS( size, suffix ) \
 int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 1aa15b20..e3d288d9 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -84,6 +84,18 @@ static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
     dct4x4[3][0][0] = 0;
 }
 
+static inline void dct2x2dc_dconly( int16_t d[2][2] )
+{
+    /* In-place 2x2 butterfly: d[1][0] gets the horizontal difference, d[0][1] the vertical one. */
+    const int a = d[0][0], b = d[0][1], c = d[1][0], e = d[1][1];
+    const int row_sum0 = a + b, row_sum1 = c + e;
+    const int row_dif0 = a - b, row_dif1 = c - e;
+    d[0][0] = row_sum0 + row_sum1;
+    d[0][1] = row_sum0 - row_sum1;
+    d[1][0] = row_dif0 + row_dif1;
+    d[1][1] = row_dif0 - row_dif1;
+}
+
 static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
 {
     int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
@@ -273,8 +285,55 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
 {
     int i, ch, nz, nz_dc;
     int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
+    DECLARE_ALIGNED_16( int16_t dct2x2[2][2]  );
     h->mb.i_cbp_chroma = 0;
 
+    /* Early termination: check variance of chroma residual before encoding.
+     * Don't bother trying early termination at low QPs.
+     * Values are experimentally derived. */
+    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) )
+    {
+        int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
+        int ssd[2];
+        int score  = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
+            score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
+        if( score < thresh*4 )
+        {
+            h->mb.cache.non_zero_count[x264_scan8[16]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[17]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[18]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[19]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[20]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[21]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[22]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[23]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[25]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[26]] = 0;
+            for( ch = 0; ch < 2; ch++ )
+            {
+                if( ssd[ch] > thresh )
+                {
+                    h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
+                    dct2x2dc_dconly( dct2x2 );
+                    if( h->mb.b_trellis )
+                        nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
+                    else
+                        nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<
+    1 );
+                    if( nz_dc )
+                    {
+                        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
+                        zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+                        idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+                        h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
+                        h->mb.i_cbp_chroma = 1;
+                    }
+                }
+            }
+            return;
+        }
+    }
+
     for( ch = 0; ch < 2; ch++ )
     {
         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
@@ -282,7 +341,6 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         int i_decimate_score = 0;
         int nz_ac = 0;
 
-        DECLARE_ALIGNED_16( int16_t dct2x2[2][2]  );
         DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
 
         if( h->mb.b_lossless )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index a42cd06b..750feed4 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -354,6 +354,23 @@ static int check_pixel( int cpu_ref, int cpu_new )
     TEST_PIXEL_VAR( PIXEL_8x8 );
     report( "pixel var :" );
 
+    ok = 1; used_asm = 0;
+    if( pixel_asm.var2_8x8 != pixel_ref.var2_8x8 )
+    {
+        int res_c, res_asm, ssd_c, ssd_asm;
+        set_func_name( "var2_8x8" );
+        used_asm = 1;
+        res_c   = call_c( pixel_c.var2_8x8, buf1, 16, buf2, 16, &ssd_c );
+        res_asm = call_a( pixel_asm.var2_8x8, buf1, 16, buf2, 16, &ssd_asm );
+        if( res_c != res_asm || ssd_c != ssd_asm ) /* both the return value and *ssd must match */
+        {
+            ok = 0;
+            fprintf( stderr, "var2_8x8: %d != %d or %d != %d [FAILED]\n", res_c, res_asm, ssd_c, ssd_asm );
+        }
+    }
+
+    report( "pixel var2 :" );
+
     for( i=0, ok=1, used_asm=0; i<4; i++ )
         if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
         {
@@ -480,6 +497,7 @@ static int check_dct( int cpu_ref, int cpu_new )
     DECLARE_ALIGNED_16( int16_t dct2[16][4][4] );
     DECLARE_ALIGNED_16( int16_t dct4[16][4][4] );
     DECLARE_ALIGNED_16( int16_t dct8[4][8][8] );
+    DECLARE_ALIGNED_8( int16_t dctdc[2][2][2] );
     x264_t h_buf;
     x264_t *h = &h_buf;
 
@@ -514,6 +532,7 @@ static int check_dct( int cpu_ref, int cpu_new )
     ok = 1; used_asm = 0;
     TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
     TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
+    TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4*2 );
     TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
     report( "sub_dct4 :" );
 
-- 
2.50.1