From ded3e28cf1f593cbd1ad7c5255ba4ec82635574c Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Sat, 31 Jan 2009 05:00:39 -0800 Subject: [PATCH] Faster 8x8dct+CAVLC interleave Integrate array_non_zero with the CAVLC 8x8dct interleave function. Roughly 1.5-2x faster than the original separate array_non_zero method. --- common/dct.c | 9 ++++++- common/dct.h | 2 +- common/x86/dct-a.asm | 59 +++++++++++++++++++++++++++++++++----------- common/x86/dct.h | 2 +- encoder/cavlc.c | 6 +---- tools/checkasm.c | 25 ++++++++++++++++++- 6 files changed, 79 insertions(+), 24 deletions(-) diff --git a/common/dct.c b/common/dct.c index 5f9f0fb0..f6095409 100644 --- a/common/dct.c +++ b/common/dct.c @@ -608,12 +608,19 @@ static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8 #undef ZIG #undef COPY4x4 -static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src ) +static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz ) { int i,j; for( i=0; i<4; i++ ) + { + int nz = 0; for( j=0; j<16; j++ ) + { + nz |= src[i+j*4]; dst[i*16+j] = src[i+j*4]; + } + nnz[(i&1) + (i>>1)*8] = !!nz; + } } void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) diff --git a/common/dct.h b/common/dct.h index 71951f9b..3819ce11 100644 --- a/common/dct.h +++ b/common/dct.h @@ -119,7 +119,7 @@ typedef struct void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] ); void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst ); void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ); - void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src ); + void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz ); } x264_zigzag_function_t; diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index 156a7ae4..b6604974 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -34,6 +34,7 @@ pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 +pb_1: times 8 db 1 SECTION .text @@ -737,19 +738,47 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3 movdqa [r0+16], xmm1 RET -INIT_MMX -cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 2,3 - mov r2d, 24 -.loop: - movq m0, [r1+r2*4+ 0] - movq m1, [r1+r2*4+ 8] - movq m2, [r1+r2*4+16] - movq m3, [r1+r2*4+24] +;----------------------------------------------------------------------------- +; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz ) +;----------------------------------------------------------------------------- + +%macro INTERLEAVE 1 + movq m0, [r1+%1*4+ 0] + movq m1, [r1+%1*4+ 8] + movq m2, [r1+%1*4+16] + movq m3, [r1+%1*4+24] TRANSPOSE4x4W 0,1,2,3,4 - movq [r0+r2+ 0], m0 - movq [r0+r2+32], m1 - movq [r0+r2+64], m2 - movq [r0+r2+96], m3 - sub r2d, 8 - jge .loop - REP_RET + movq [r0+%1+ 0], m0 + movq [r0+%1+32], m1 + movq [r0+%1+64], m2 + movq [r0+%1+96], m3 +%if %1 + packsswb m0, m1 + por m6, m2 + por m7, m3 + por m5, m0 +%else + packsswb m0, m1 + SWAP m5, m0 + SWAP m6, m2 + SWAP m7, m3 +%endif +%endmacro + +INIT_MMX +cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3 + INTERLEAVE 0 + INTERLEAVE 8 + INTERLEAVE 16 + INTERLEAVE 24 + packsswb m6, m7 + packsswb m5, m6 + packsswb m5, m5 + pxor m0, m0 + pcmpeqb m5, m0 + paddb m5, [pb_1 GLOBAL] + movd r0d, m5 + mov [r2+0], r0w + shr r0d, 16 + mov [r2+8], r0w + RET diff --git a/common/x86/dct.h b/common/x86/dct.h index 99392761..7617ea58 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -61,6 +61,6 @@ void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] ); void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] ); void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] ); void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst ); -void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src ); +void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz ); #endif diff --git a/encoder/cavlc.c b/encoder/cavlc.c index bfeecc2f..50eb5a1a 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -273,11 +273,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s /* shuffle 8x8 dct coeffs into 4x4 lists */ for( i8 = i8start; i8 <= i8end; i8++ ) if( h->mb.i_cbp_luma & (1 << i8) ) - { - h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] ); - for( i4 = 0; i4 < 4; i4++ ) - h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] ); - } + h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] ); } for( i8 = i8start; i8 <= i8end; i8++ ) diff --git a/tools/checkasm.c b/tools/checkasm.c index 3f89e681..29ddadd5 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -635,6 +635,26 @@ static int check_dct( int cpu_ref, int cpu_new ) call_a2( zigzag_asm.name, t2, buf2, buf4 ); \ } +#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \ + if( zigzag_asm.name != zigzag_ref.name ) \ + { \ + for( j=0; j<100; j++ ) \ + { \ + set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\ + used_asm = 1; \ + memcpy(dct, buf1, size*sizeof(int16_t));\ + for( i=0; i