From 8029e6640967ee71b4ff94233615a5e291da62f4 Mon Sep 17 00:00:00 2001 From: Simon Horlick Date: Sun, 17 Apr 2011 10:05:51 +0100 Subject: [PATCH] MBAFF: Don't call zigzag_init for every mb --- common/common.h | 2 + common/dct.c | 178 ++++++++++++++++++++++------------------------ common/dct.h | 2 +- encoder/encoder.c | 5 +- tools/checkasm.c | 76 +++++++++----------- 5 files changed, 125 insertions(+), 138 deletions(-) diff --git a/common/common.h b/common/common.h index d24a155c..38d7d438 100644 --- a/common/common.h +++ b/common/common.h @@ -878,6 +878,8 @@ struct x264_t x264_mc_functions_t mc; x264_dct_function_t dctf; x264_zigzag_function_t zigzagf; + x264_zigzag_function_t zigzagf_interlaced; + x264_zigzag_function_t zigzagf_progressive; x264_quant_function_t quantf; x264_deblock_function_t loopf; x264_bitstream_function_t bsf; diff --git a/common/dct.c b/common/dct.c index a27cb0f8..38d3be3e 100644 --- a/common/dct.c +++ b/common/dct.c @@ -746,123 +746,117 @@ static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nn } } -void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) -{ - if( b_interlaced ) - { - pf->scan_8x8 = zigzag_scan_8x8_field; - pf->scan_4x4 = zigzag_scan_4x4_field; - pf->sub_8x8 = zigzag_sub_8x8_field; - pf->sub_4x4 = zigzag_sub_4x4_field; - pf->sub_4x4ac = zigzag_sub_4x4ac_field; +void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced ) +{ + pf_interlaced->scan_8x8 = zigzag_scan_8x8_field; + pf_progressive->scan_8x8 = zigzag_scan_8x8_frame; + pf_interlaced->scan_4x4 = zigzag_scan_4x4_field; + pf_progressive->scan_4x4 = zigzag_scan_4x4_frame; + pf_interlaced->sub_8x8 = zigzag_sub_8x8_field; + pf_progressive->sub_8x8 = zigzag_sub_8x8_frame; + pf_interlaced->sub_4x4 = zigzag_sub_4x4_field; + pf_progressive->sub_4x4 = zigzag_sub_4x4_frame; + pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field; + pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame; + #if HIGH_BIT_DEPTH #if HAVE_MMX - if( cpu&X264_CPU_SSE2 ) - pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2; - if( cpu&X264_CPU_SSE4 ) - pf->scan_8x8 = x264_zigzag_scan_8x8_field_sse4; - if( cpu&X264_CPU_AVX ) - pf->scan_8x8 = x264_zigzag_scan_8x8_field_avx; -#endif // HAVE_MMX -#else -#if HAVE_MMX - if( cpu&X264_CPU_MMXEXT ) - { - pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext; - pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext; - } - if( cpu&X264_CPU_SSSE3 ) - { - pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3; - pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3; - } - if( cpu&X264_CPU_AVX ) - { - pf->sub_4x4 = x264_zigzag_sub_4x4_field_avx; -#if ARCH_X86_64 - pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_avx; -#endif - } -#endif // HAVE_MMX -#if HAVE_ALTIVEC - if( cpu&X264_CPU_ALTIVEC ) - pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec; -#endif -#endif // HIGH_BIT_DEPTH - } - else + if( cpu&X264_CPU_SSE2 ) { - pf->scan_8x8 = zigzag_scan_8x8_frame; - pf->scan_4x4 = zigzag_scan_4x4_frame; - pf->sub_8x8 = zigzag_sub_8x8_frame; - pf->sub_4x4 = zigzag_sub_4x4_frame; - pf->sub_4x4ac = zigzag_sub_4x4ac_frame; -#if HIGH_BIT_DEPTH -#if HAVE_MMX - if( cpu&X264_CPU_SSE2 ) - { - pf->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2; - pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2; - } + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2; + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2; + } + if( cpu&X264_CPU_SSE4 ) + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4; + if( cpu&X264_CPU_AVX ) + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx; #if ARCH_X86_64 - if( cpu&X264_CPU_AVX ) - { - pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; - pf->scan_8x8 = x264_zigzag_scan_8x8_frame_avx; - } + if( cpu&X264_CPU_AVX ) + { + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx; + } #endif // ARCH_X86_64 #endif // HAVE_MMX #else #if HAVE_MMX - if( cpu&X264_CPU_MMX ) - pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx; - if( cpu&X264_CPU_MMXEXT ) - pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext; - if( cpu&X264_CPU_SSE2_IS_FAST ) - pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2; - if( cpu&X264_CPU_SSSE3 ) - { - pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3; - pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3; - pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3; - if( cpu&X264_CPU_SHUFFLE_IS_FAST ) - pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3; - } - if( cpu&X264_CPU_AVX ) - { - pf->sub_4x4 = x264_zigzag_sub_4x4_frame_avx; + if( cpu&X264_CPU_MMX ) + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx; + if( cpu&X264_CPU_MMXEXT ) + { + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext; + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext; + } + if( cpu&X264_CPU_SSE2_IS_FAST ) + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2; + if( cpu&X264_CPU_SSSE3 ) + { + pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3; + pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3; + pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3; + pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3; + if( cpu&X264_CPU_SHUFFLE_IS_FAST ) + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3; + } + if( cpu&X264_CPU_AVX ) + { + pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx; + pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx; #if ARCH_X86_64 - pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx; + pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx; + pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx; #endif - if( cpu&X264_CPU_SHUFFLE_IS_FAST ) - pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; - } + if( cpu&X264_CPU_SHUFFLE_IS_FAST ) + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; + } #endif // HAVE_MMX #if HAVE_ALTIVEC - if( cpu&X264_CPU_ALTIVEC ) - pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec; + if( cpu&X264_CPU_ALTIVEC ) + { + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec; + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec; + } #endif #if HAVE_ARMV6 - if( cpu&X264_CPU_NEON ) - pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; + if( cpu&X264_CPU_NEON ) + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; #endif #endif // HIGH_BIT_DEPTH - } - pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc; + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc; #if HAVE_MMX #if HIGH_BIT_DEPTH if( cpu&X264_CPU_SSE2 ) - pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; + } if( cpu&X264_CPU_AVX ) - pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; + } #else if( cpu&X264_CPU_MMX ) - pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx; + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx; + } if( cpu&X264_CPU_SHUFFLE_IS_FAST ) - pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; + } + if( cpu&X264_CPU_AVX ) - pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; + } #endif // HIGH_BIT_DEPTH #endif } diff --git a/common/dct.h b/common/dct.h index d3773ad6..9303d446 100644 --- a/common/dct.h +++ b/common/dct.h @@ -132,6 +132,6 @@ typedef struct void x264_dct_init( int cpu, x264_dct_function_t *dctf ); void x264_dct_init_weights( void ); -void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ); +void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced ); #endif diff --git a/encoder/encoder.c b/encoder/encoder.c index b90df68f..bf82cebb 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -1088,7 +1088,8 @@ x264_t *x264_encoder_open( x264_param_t *param ) x264_cavlc_init(); x264_pixel_init( h->param.cpu, &h->pixf ); x264_dct_init( h->param.cpu, &h->dctf ); - x264_zigzag_init( h->param.cpu, &h->zigzagf, h->param.b_interlaced ); + x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced ); + memcpy( &h->zigzagf, h->param.b_interlaced ? &h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) ); x264_mc_init( h->param.cpu, &h->mc ); x264_quant_init( h, h->param.cpu, &h->quantf ); x264_deblock_init( h->param.cpu, &h->loopf, h->param.b_interlaced ); @@ -2056,8 +2057,8 @@ static int x264_slice_write( x264_t *h ) int stride = h->fenc->i_stride[0]; pixel *fenc = h->fenc->plane[0] + 16 * (i_mb_x + i_mb_y * stride); h->mb.b_interlaced = x264_field_vsad( h, fenc, stride ); + memcpy( &h->zigzagf, h->mb.b_interlaced ? &h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) ); } - x264_zigzag_init( h->param.cpu, &h->zigzagf, h->mb.b_interlaced ); } h->mb.field[mb_xy] = h->mb.b_interlaced; } diff --git a/tools/checkasm.c b/tools/checkasm.c index 8f5ce9f4..0e896ba2 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -546,7 +546,7 @@ static int check_dct( int cpu_ref, int cpu_new ) x264_dct_function_t dct_ref; x264_dct_function_t dct_asm; x264_quant_function_t qf; - int ret = 0, ok, used_asm, interlace; + int ret = 0, ok, used_asm, interlace = 0; ALIGNED_16( dctcoef dct1[16][16] ); ALIGNED_16( dctcoef dct2[16][16] ); ALIGNED_16( dctcoef dct4[16][16] ); @@ -711,21 +711,21 @@ static int check_dct( int cpu_ref, int cpu_new ) TEST_DCTDC( idct4x4dc ); #undef TEST_DCTDC - x264_zigzag_function_t zigzag_c; - x264_zigzag_function_t zigzag_ref; - x264_zigzag_function_t zigzag_asm; + x264_zigzag_function_t zigzag_c[2]; + x264_zigzag_function_t zigzag_ref[2]; + x264_zigzag_function_t zigzag_asm[2]; ALIGNED_16( dctcoef level1[64] ); ALIGNED_16( dctcoef level2[64] ); #define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \ - if( zigzag_asm.name != zigzag_ref.name ) \ + if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ { \ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \ used_asm = 1; \ memcpy(dct, buf1, size*sizeof(dctcoef)); \ - call_c( zigzag_c.name, t1, dct ); \ - call_a( zigzag_asm.name, t2, dct ); \ + call_c( zigzag_c[interlace].name, t1, dct ); \ + call_a( zigzag_asm[interlace].name, t2, dct ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \ { \ ok = 0; \ @@ -734,26 +734,26 @@ static int check_dct( int cpu_ref, int cpu_new ) } #define TEST_ZIGZAG_SUB( name, t1, t2, size ) \ - if( zigzag_asm.name != zigzag_ref.name ) \ + if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ { \ int nz_a, nz_c; \ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \ used_asm = 1; \ memcpy( pbuf3, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \ memcpy( pbuf4, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \ - nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3 ); \ - nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4 ); \ + nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \ + nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE*sizeof(pixel) ) || nz_c != nz_a ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ } \ - call_c2( zigzag_c.name, t1, pbuf2, pbuf3 ); \ - call_a2( zigzag_asm.name, t2, pbuf2, pbuf4 ); \ + call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \ + call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \ } #define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \ - if( zigzag_asm.name != zigzag_ref.name ) \ + if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ { \ int nz_a, nz_c; \ dctcoef dc_a, dc_c; \ @@ -768,8 +768,8 @@ static int check_dct( int cpu_ref, int cpu_new ) memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \ memcpy( pbuf4 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \ } \ - nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3, &dc_c ); \ - nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4, &dc_a ); \ + nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \ + nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \ if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \ { \ ok = 0; \ @@ -777,12 +777,12 @@ static int check_dct( int cpu_ref, int cpu_new ) break; \ } \ } \ - call_c2( zigzag_c.name, t1, pbuf2, pbuf3, &dc_c ); \ - call_a2( zigzag_asm.name, t2, pbuf2, pbuf4, &dc_a ); \ + call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \ + call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \ } #define TEST_INTERLEAVE( name, t1, t2, dct, size ) \ - if( zigzag_asm.name != zigzag_ref.name ) \ + if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ { \ for( int j = 0; j < 100; j++ ) \ { \ @@ -792,8 +792,8 @@ static int check_dct( int cpu_ref, int cpu_new ) for( int i = 0; i < size; i++ ) \ dct[i] = rand()&0x1F ? 0 : dct[i]; \ memcpy(buf3, buf4, 10); \ - call_c( zigzag_c.name, t1, dct, buf3 ); \ - call_a( zigzag_asm.name, t2, dct, buf4 ); \ + call_c( zigzag_c[interlace].name, t1, dct, buf3 ); \ + call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \ { \ ok = 0; \ @@ -801,33 +801,23 @@ static int check_dct( int cpu_ref, int cpu_new ) } \ } - interlace = 0; - x264_zigzag_init( 0, &zigzag_c, 0 ); - x264_zigzag_init( cpu_ref, &zigzag_ref, 0 ); - x264_zigzag_init( cpu_new, &zigzag_asm, 0 ); - - ok = 1; used_asm = 0; - TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 ); - TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 ); - TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); - TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 ); - report( "zigzag_frame :" ); - - interlace = 1; - x264_zigzag_init( 0, &zigzag_c, 1 ); - x264_zigzag_init( cpu_ref, &zigzag_ref, 1 ); - x264_zigzag_init( cpu_new, &zigzag_asm, 1 ); - - ok = 1; used_asm = 0; - TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 ); - TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 ); - TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); - TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 ); - report( "zigzag_field :" ); + x264_zigzag_init( 0, &zigzag_c[0], &zigzag_c[1] ); + x264_zigzag_init( cpu_ref, &zigzag_ref[0], &zigzag_ref[1] ); + x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] ); ok = 1; used_asm = 0; TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0], 64 ); report( "zigzag_interleave :" ); + + for( interlace = 0; interlace <= 1; interlace++ ) + { + ok = 1; used_asm = 0; + TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 ); + TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 ); + TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); + TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 ); + report( interlace ? "zigzag_field :" : "zigzag_frame :" ); + } #undef TEST_ZIGZAG_SCAN #undef TEST_ZIGZAG_SUB -- 2.40.0