From 205a032c22467c90c26d33ed9ab23d60461e57c1 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Tue, 30 Jun 2009 15:20:32 -0700 Subject: [PATCH] Early termination for chroma encoding Faster chroma encoding by terminating early if heuristics indicate that the block will be DC-only. This works because the vast majority of inter chroma blocks have no coefficients at all, and those that do are almost always DC-only. Add two new helper DSP functions for this: sub8x8_dct_dc and var2_8x8, with mmxext and sse2 versions of both plus an ssse3 version of var2_8x8. Early termination is disabled at very low QPs because it is not useful there. Performance increase is ~1-2% without trellis, up to 5-6% with trellis=2. The increase is greater at lower bitrates. --- common/dct.c | 25 +++++++++ common/dct.h | 1 + common/pixel.c | 28 ++++++++++ common/pixel.h | 1 + common/x86/dct-a.asm | 74 +++++++++++++++++++++++++++ common/x86/dct.h | 3 +- common/x86/pixel-a.asm | 113 +++++++++++++++++++++++++++++++++++++++++ common/x86/pixel.h | 3 ++ encoder/macroblock.c | 60 +++++++++++++++++++++- tools/checkasm.c | 19 +++++++ 10 files changed, 325 insertions(+), 2 deletions(-) diff --git a/common/dct.c b/common/dct.c index 1f8f4b39..3a2d9161 100644 --- a/common/dct.c +++ b/common/dct.c @@ -170,6 +170,28 @@ static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 ) sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] ); } +static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 ) +{ + int16_t d[4][4]; + int sum = 0; + + pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE ); + + sum += d[0][0] + d[0][1] + d[0][2] + d[0][3]; + sum += d[1][0] + d[1][1] + d[1][2] + d[1][3]; + sum += d[2][0] + d[2][1] + d[2][2] + d[2][3]; + sum += d[3][0] + d[3][1] + d[3][2] + d[3][3]; + + return sum; +} + +static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 ) +{ + dct[0][0] = sub4x4_dct_dc( &pix1[0], &pix2[0] ); + dct[0][1] = sub4x4_dct_dc( &pix1[4], &pix2[4] ); + dct[1][0] = 
sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] ); + dct[1][1] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] ); +} static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] ) { @@ -391,6 +413,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->add4x4_idct = add4x4_idct; dctf->sub8x8_dct = sub8x8_dct; + dctf->sub8x8_dct_dc = sub8x8_dct_dc; dctf->add8x8_idct = add8x8_idct; dctf->add8x8_idct_dc = add8x8_idct_dc; @@ -416,6 +439,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx; dctf->dct4x4dc = x264_dct4x4dc_mmx; dctf->idct4x4dc = x264_idct4x4dc_mmx; + dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext; #ifndef ARCH_X86_64 dctf->sub8x8_dct = x264_sub8x8_dct_mmx; @@ -434,6 +458,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) { dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2; dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2; + dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2; dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; dctf->add16x16_idct8= x264_add16x16_idct8_sse2; diff --git a/common/dct.h b/common/dct.h index 3819ce11..a38bf919 100644 --- a/common/dct.h +++ b/common/dct.h @@ -95,6 +95,7 @@ typedef struct void (*add4x4_idct) ( uint8_t *p_dst, int16_t dct[4][4] ); void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 ); + void (*sub8x8_dct_dc)( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 ); void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][4][4] ); void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[2][2] ); diff --git a/common/pixel.c b/common/pixel.c index 5932f07f..852748ec 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -160,6 +160,30 @@ static int name( uint8_t *pix, int i_stride ) \ PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 ) PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 ) +/**************************************************************************** + * pixel_var2_wxh + 
****************************************************************************/ +static int pixel_var2_8x8( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2, int *ssd ) +{ + uint32_t var = 0, sum = 0, sqr = 0; + int x, y; + for( y = 0; y < 8; y++ ) + { + for( x = 0; x < 8; x++ ) + { + int diff = pix1[x] - pix2[x]; + sum += diff; + sqr += diff * diff; + } + pix1 += i_stride1; + pix2 += i_stride2; + } + sum = abs(sum); + var = sqr - (sum * sum >> 6); + *ssd = sqr; + return var; +} + #define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\ int t0 = s0 + s1;\ @@ -611,6 +635,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->ssim_4x4x2_core = ssim_4x4x2_core; pixf->ssim_end4 = ssim_end4; + pixf->var2_8x8 = pixel_var2_8x8; #ifdef HAVE_MMX if( cpu&X264_CPU_MMX ) @@ -636,6 +661,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext; + pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext; if( cpu&X264_CPU_CACHELINE_32 ) { @@ -682,6 +708,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #ifdef ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; #endif + pixf->var2_8x8 = x264_pixel_var2_8x8_sse2; } if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) @@ -761,6 +788,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #ifdef ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3; #endif + pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) { INIT2( sad, _cache64_ssse3 ); diff --git a/common/pixel.h b/common/pixel.h index 207c74f2..53f99566 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -73,6 +73,7 @@ typedef struct x264_pixel_cmp_x3_t fpelcmp_x3[7]; x264_pixel_cmp_x4_t fpelcmp_x4[7]; x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */ + int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * ); 
int (*var[4])( uint8_t *pix, int stride ); uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride ); diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index 6e92df6f..64cde9c3 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -36,6 +36,7 @@ pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 pb_1: times 16 db 1 +pw_1: times 8 dw 1 SECTION .text @@ -427,6 +428,79 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8 IDCT_DC_STORE 0, xmm2, xmm3 ret +;----------------------------------------------------------------------------- +; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 ) +;----------------------------------------------------------------------------- + +%macro DCTDC_2ROW_MMX 3 + movq %1, [r1+FENC_STRIDE*(0+%3)] + movq m1, [r1+FENC_STRIDE*(1+%3)] + movq m2, [r2+FDEC_STRIDE*(0+%3)] + movq m3, [r2+FDEC_STRIDE*(1+%3)] + movq %2, %1 + punpckldq %1, m1 + punpckhdq %2, m1 + movq m1, m2 + punpckldq m2, m3 + punpckhdq m1, m3 + psadbw %1, m7 + psadbw %2, m7 + psadbw m2, m7 + psadbw m1, m7 + psubw %1, m2 + psubw %2, m1 +%endmacro + +INIT_MMX +cglobal x264_sub8x8_dct_dc_mmxext, 3,3 + pxor m7, m7 + call .loop + add r1, FENC_STRIDE*4 + add r2, FDEC_STRIDE*4 + add r0, 4 +.loop: + DCTDC_2ROW_MMX m0, m4, 0 + DCTDC_2ROW_MMX m5, m6, 2 + paddw m0, m5 + paddw m4, m6 + punpcklwd m0, m4 + movd [r0], m0 + ret + +INIT_XMM +%macro DCTDC_2ROW_SSE2 3 + movq m0, [r1+FENC_STRIDE*(0+%1)] + movq m1, [r1+FENC_STRIDE*(1+%1)] + movq m2, [r2+FDEC_STRIDE*(0+%1)] + movq m3, [r2+FDEC_STRIDE*(1+%1)] + punpckldq m0, m1 + punpckldq m2, m3 + psadbw m0, m7 + psadbw m2, m7 +%if %2 + paddw %3, m0 + paddw m6, m2 +%else + SWAP %3, m0 + SWAP m6, m2 +%endif +%endmacro + +cglobal x264_sub8x8_dct_dc_sse2, 3,3,8 + pxor m7, m7 + DCTDC_2ROW_SSE2 0, 0, m4 + DCTDC_2ROW_SSE2 2, 1, m4 + add r1, FENC_STRIDE*4 + add r2, FDEC_STRIDE*4 + psubq m4, m6 + DCTDC_2ROW_SSE2 0, 0, m5 + 
DCTDC_2ROW_SSE2 2, 1, m5 + psubq m5, m6 + packssdw m4, m5 + packssdw m4, m4 + movq [r0], m4 + RET + ;----------------------------------------------------------------------------- ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- diff --git a/common/x86/dct.h b/common/x86/dct.h index 44518212..87ab8fc0 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -32,7 +32,8 @@ void x264_sub16x16_dct_sse2 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *p void x264_sub4x4_dct_ssse3 ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_ssse3 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 ); - +void x264_sub8x8_dct_dc_mmxext( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_dc_sse2 ( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 ); void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] ); void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] ); diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 9617f9e4..6a235c3d 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -386,6 +386,119 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8 jg .loop VAR_END 6 +%macro VAR2_END 0 + HADDW m5, m7 + movd r1d, m5 + imul r1d, r1d + HADDD m6, m1 + shr r1d, 6 + movd eax, m6 + mov [r4], eax + sub eax, r1d ; sqr - (sum * sum >> shift) + RET +%endmacro + +;----------------------------------------------------------------------------- +; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * ) +;----------------------------------------------------------------------------- +%ifndef ARCH_X86_64 +INIT_MMX +cglobal x264_pixel_var2_8x8_mmxext, 5,6 + VAR_START 0 + mov r5d, 8 +.loop: + movq m0, [r0] + movq m1, m0 + movq m4, m0 + movq m2, [r2] + movq m3, m2 + punpcklbw m0, m7 + punpckhbw m1, m7 + 
punpcklbw m2, m7 + punpckhbw m3, m7 + psubw m0, m2 + psubw m1, m3 + paddw m5, m0 + paddw m5, m1 + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m6, m0 + paddd m6, m1 + add r0, r1 + add r2, r3 + dec r5d + jg .loop + VAR2_END + RET +%endif + +INIT_XMM +cglobal x264_pixel_var2_8x8_sse2, 5,6,8 + VAR_START 1 + mov r5d, 4 +.loop: + movq m1, [r0] + movhps m1, [r0+r1] + movq m3, [r2] + movhps m3, [r2+r3] + DEINTB 0, 1, 2, 3, 7 + psubw m0, m2 + psubw m1, m3 + paddw m5, m0 + paddw m5, m1 + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m6, m0 + paddd m6, m1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + dec r5d + jg .loop + VAR2_END + RET + +cglobal x264_pixel_var2_8x8_ssse3, 5,6,8 + pxor m5, m5 ; sum + pxor m6, m6 ; sum squared + mova m7, [hsub_mul GLOBAL] + mov r5d, 2 +.loop: + movq m0, [r0] + movq m2, [r2] + movq m1, [r0+r1] + movq m3, [r2+r3] + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m0, m2 + punpcklbw m1, m3 + movq m2, [r0] + movq m3, [r2] + punpcklbw m2, m3 + movq m3, [r0+r1] + movq m4, [r2+r3] + punpcklbw m3, m4 + pmaddubsw m0, m7 + pmaddubsw m1, m7 + pmaddubsw m2, m7 + pmaddubsw m3, m7 + paddw m5, m0 + paddw m5, m1 + paddw m5, m2 + paddw m5, m3 + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + paddd m6, m0 + paddd m6, m1 + paddd m6, m2 + paddd m6, m3 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + dec r5d + jg .loop + VAR2_END + RET ;============================================================================= ; SATD diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 312aca8f..b1e22cee 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -102,6 +102,9 @@ void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width ); +int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * ); +int x264_pixel_var2_8x8_sse2( uint8_t *, int, 
uint8_t *, int, int * ); +int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * ); #define DECL_ADS( size, suffix ) \ int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\ diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 1aa15b20..e3d288d9 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -84,6 +84,18 @@ static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] ) dct4x4[3][0][0] = 0; } +static inline void dct2x2dc_dconly( int16_t d[2][2] ) +{ + int d0 = d[0][0] + d[0][1]; + int d1 = d[1][0] + d[1][1]; + int d2 = d[0][0] - d[0][1]; + int d3 = d[1][0] - d[1][1]; + d[0][0] = d0 + d1; + d[1][0] = d2 + d3; + d[0][1] = d0 - d1; + d[1][1] = d2 - d3; +} + static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx ) { int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY; @@ -273,8 +285,55 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) { int i, ch, nz, nz_dc; int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate); + DECLARE_ALIGNED_16( int16_t dct2x2[2][2] ); h->mb.i_cbp_chroma = 0; + /* Early termination: check variance of chroma residual before encoding. + * Don't bother trying early termination at low QPs. + * Values are experimentally derived. */ + if( b_decimate && i_qp >= (h->mb.b_trellis ? 
12 : 18) ) + { + int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6; + int ssd[2]; + int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] ); + score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); + if( score < thresh*4 ) + { + h->mb.cache.non_zero_count[x264_scan8[16]] = 0; + h->mb.cache.non_zero_count[x264_scan8[17]] = 0; + h->mb.cache.non_zero_count[x264_scan8[18]] = 0; + h->mb.cache.non_zero_count[x264_scan8[19]] = 0; + h->mb.cache.non_zero_count[x264_scan8[20]] = 0; + h->mb.cache.non_zero_count[x264_scan8[21]] = 0; + h->mb.cache.non_zero_count[x264_scan8[22]] = 0; + h->mb.cache.non_zero_count[x264_scan8[23]] = 0; + h->mb.cache.non_zero_count[x264_scan8[25]] = 0; + h->mb.cache.non_zero_count[x264_scan8[26]] = 0; + for( ch = 0; ch < 2; ch++ ) + { + if( ssd[ch] > thresh ) + { + h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] ); + dct2x2dc_dconly( dct2x2 ); + if( h->mb.b_trellis ) + nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter ); + else + nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<< + 1 ); + if( nz_dc ) + { + h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1; + zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); + idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); + h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 ); + h->mb.i_cbp_chroma = 1; + } + } + } + return; + } + } + for( ch = 0; ch < 2; ch++ ) { uint8_t *p_src = h->mb.pic.p_fenc[1+ch]; @@ -282,7 +341,6 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) int i_decimate_score = 0; int nz_ac = 0; - DECLARE_ALIGNED_16( int16_t dct2x2[2][2] ); DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] ); if( h->mb.b_lossless ) diff --git a/tools/checkasm.c b/tools/checkasm.c index a42cd06b..750feed4 100644 --- 
a/tools/checkasm.c +++ b/tools/checkasm.c @@ -354,6 +354,23 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_PIXEL_VAR( PIXEL_8x8 ); report( "pixel var :" ); + ok = 1; used_asm = 0; + if( pixel_asm.var2_8x8 != pixel_ref.var2_8x8 ) + { + int res_c, res_asm, ssd_c, ssd_asm; + set_func_name( "var2_8x8" ); + used_asm = 1; + res_c = call_c( pixel_c.var2_8x8, buf1, 16, buf2, 16, &ssd_c ); + res_asm = call_a( pixel_asm.var2_8x8, buf1, 16, buf2, 16, &ssd_asm ); + if( res_c != res_asm || ssd_c != ssd_asm ) + { + ok = 0; + fprintf( stderr, "var[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); + } + } + + report( "pixel var2 :" ); + for( i=0, ok=1, used_asm=0; i<4; i++ ) if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] ) { @@ -480,6 +497,7 @@ static int check_dct( int cpu_ref, int cpu_new ) DECLARE_ALIGNED_16( int16_t dct2[16][4][4] ); DECLARE_ALIGNED_16( int16_t dct4[16][4][4] ); DECLARE_ALIGNED_16( int16_t dct8[4][8][8] ); + DECLARE_ALIGNED_8( int16_t dctdc[2][2][2] ); x264_t h_buf; x264_t *h = &h_buf; @@ -514,6 +532,7 @@ static int check_dct( int cpu_ref, int cpu_new ) ok = 1; used_asm = 0; TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 ); TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 ); + TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4*2 ); TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 ); report( "sub_dct4 :" ); -- 2.40.0