From: Loren Merritt Date: Thu, 20 Mar 2008 05:43:19 +0000 (-0600) Subject: reduce zigzag arrays from int to int16_t X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=30da25a99e24e5c1ff5972b7f5c22c4be2a944b1;p=libx264 reduce zigzag arrays from int to int16_t --- diff --git a/common/common.h b/common/common.h index 5ee3ef8a..c21ad1c8 100644 --- a/common/common.h +++ b/common/common.h @@ -337,14 +337,14 @@ struct x264_t /* Current MB DCT coeffs */ struct { - DECLARE_ALIGNED( int, luma16x16_dc[16], 16 ); - DECLARE_ALIGNED( int, chroma_dc[2][4], 16 ); + DECLARE_ALIGNED( int16_t, luma16x16_dc[16], 16 ); + DECLARE_ALIGNED( int16_t, chroma_dc[2][4], 16 ); // FIXME merge with union - DECLARE_ALIGNED( int, luma8x8[4][64], 16 ); + DECLARE_ALIGNED( int16_t, luma8x8[4][64], 16 ); union { - DECLARE_ALIGNED( int, residual_ac[15], 16 ); - DECLARE_ALIGNED( int, luma4x4[16], 16 ); + DECLARE_ALIGNED( int16_t, residual_ac[15], 16 ); + DECLARE_ALIGNED( int16_t, luma4x4[16], 16 ); } block[16+8]; } dct; @@ -441,8 +441,8 @@ struct x264_t /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */ DECLARE_ALIGNED( uint8_t, i4x4_fdec_buf[16*16], 16 ); DECLARE_ALIGNED( uint8_t, i8x8_fdec_buf[16*16], 16 ); - DECLARE_ALIGNED( int, i8x8_dct_buf[3][64], 16 ); - DECLARE_ALIGNED( int, i4x4_dct_buf[15][16], 16 ); + DECLARE_ALIGNED( int16_t, i8x8_dct_buf[3][64], 16 ); + DECLARE_ALIGNED( int16_t, i4x4_dct_buf[15][16], 16 ); /* pointer over mb of the frame to be compressed */ uint8_t *p_fenc[3]; diff --git a/common/dct.c b/common/dct.c index 74e5a917..895306b5 100644 --- a/common/dct.c +++ b/common/dct.c @@ -458,9 +458,10 @@ void x264_dct_init_weights( void ) } -#define ZIG(i,y,x) level[i] = dct[x][y]; +// gcc pessimizes multi-dimensional arrays here, even with constant indices +#define ZIG(i,y,x) level[i] = dct[0][x*8+y]; -static void zigzag_scan_8x8_frame( int level[64], int16_t dct[8][8] ) +static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] ) { ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0) ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2) @@ -480,7 +481,7 @@ static void zigzag_scan_8x8_frame( int level[64], int16_t dct[8][8] ) ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7) } -static void zigzag_scan_8x8_field( int level[64], int16_t dct[8][8] ) +static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] ) { ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1) ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1) @@ -500,7 +501,10 @@ static void zigzag_scan_8x8_field( int level[64], int16_t dct[8][8] ) ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7) } -static void zigzag_scan_4x4_frame( int level[16], int16_t dct[4][4] ) +#undef ZIG +#define ZIG(i,y,x) level[i] = dct[0][x*4+y]; + +static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] ) { ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0) ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2) @@ -508,15 +512,16 @@ static void zigzag_scan_4x4_frame( int level[16], int16_t dct[4][4] ) ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3) } -static void zigzag_scan_4x4_field( int level[16], int16_t dct[4][4] ) +static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] ) { - ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0) - ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1) - ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2) - ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3) + *(uint32_t*)level = *(uint32_t*)dct; + ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1) + *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6); + *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8); + *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12); } -static void zigzag_scan_4x4ac_frame( int level[15], int16_t dct[4][4] ) +static void zigzag_scan_4x4ac_frame( int16_t level[15], int16_t dct[4][4] ) { ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2) @@ -524,7 +529,7 @@ static void zigzag_scan_4x4ac_frame( int level[15], int16_t dct[4][4] ) ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3) } -static void zigzag_scan_4x4ac_field( int level[15], int16_t dct[4][4] ) +static void zigzag_scan_4x4ac_field( int16_t level[15], int16_t dct[4][4] ) { ZIG( 0,1,0) ZIG( 1,0,1) ZIG( 2,2,0) ZIG( 3,3,0) ZIG( 4,1,1) ZIG( 5,2,1) ZIG( 6,3,1) @@ -533,7 +538,6 @@ static void zigzag_scan_4x4ac_field( int level[15], int16_t dct[4][4] ) } #undef ZIG - #define ZIG(i,y,x) {\ int oe = x+y*FENC_STRIDE;\ int od = x+y*FDEC_STRIDE;\ @@ -541,7 +545,7 @@ static void zigzag_scan_4x4ac_field( int level[15], int16_t dct[4][4] ) p_dst[od] = p_src[oe];\ } -static void zigzag_sub_4x4_frame( int level[16], const uint8_t *p_src, uint8_t *p_dst ) +static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ) { ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0) ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2) @@ -549,7 +553,7 @@ static void zigzag_sub_4x4_frame( int level[16], const uint8_t *p_src, uint8_t * ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3) } -static void zigzag_sub_4x4_field( int level[16], const uint8_t *p_src, uint8_t *p_dst ) +static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ) { ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0) ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1) @@ -557,7 +561,7 @@ static void zigzag_sub_4x4_field( int level[16], const uint8_t *p_src, uint8_t * ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3) } -static void zigzag_sub_4x4ac_frame( int level[15], const uint8_t *p_src, uint8_t *p_dst ) +static void zigzag_sub_4x4ac_frame( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst ) { ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2) @@ -565,7 +569,7 @@ static void zigzag_sub_4x4ac_frame( int level[15], const uint8_t *p_src, uint8_t ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3) } -static void zigzag_sub_4x4ac_field( int level[15], const uint8_t *p_src, uint8_t *p_dst ) +static void zigzag_sub_4x4ac_field( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst ) { ZIG( 0,1,0) ZIG( 1,0,1) ZIG( 2,2,0) ZIG( 3,3,0) ZIG( 4,1,1) ZIG( 5,2,1) ZIG( 6,3,1) @@ -585,12 +589,8 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) pf->sub_4x4 = zigzag_sub_4x4_field; pf->sub_4x4ac = zigzag_sub_4x4ac_field; #ifdef HAVE_MMX -#ifdef ARCH_X86 - if( cpu&X264_CPU_MMX ) - pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmx; -#endif - if( cpu&X264_CPU_SSE2 ) - pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2; + if( cpu&X264_CPU_MMXEXT ) + pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext; #endif #ifdef ARCH_PPC diff --git a/common/dct.h b/common/dct.h index 4c71e478..cf7dbbd1 100644 --- a/common/dct.h +++ b/common/dct.h @@ -108,11 +108,11 @@ typedef struct typedef struct { - void (*scan_8x8)( int level[64], int16_t dct[8][8] ); - void (*scan_4x4)( int level[16], int16_t dct[4][4] ); - void (*scan_4x4ac)( int level[15], int16_t dct[4][4] ); - void (*sub_4x4)( int level[16], const uint8_t *p_src, uint8_t *p_dst ); - void (*sub_4x4ac)( int level[15], const uint8_t *p_src, uint8_t *p_dst ); + void (*scan_8x8)( int16_t level[64], int16_t dct[8][8] ); + void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] ); + void (*scan_4x4ac)( int16_t level[15], int16_t dct[4][4] ); + void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ); + void (*sub_4x4ac)( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst ); } x264_zigzag_function_t; diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm index b7de1780..260bcf03 100644 --- a/common/x86/dct-32.asm +++ b/common/x86/dct-32.asm @@ -526,35 +526,3 @@ ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8 ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8 -;----------------------------------------------------------------------------- -; void x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] ) -;----------------------------------------------------------------------------- -cglobal x264_zigzag_scan_4x4_field_mmx - mov edx, [esp+8] - mov ecx, [esp+4] - punpcklwd mm0, [edx] - punpckhwd mm1, [edx] - punpcklwd mm2, [edx+8] - punpckhwd mm3, [edx+8] - punpcklwd mm4, [edx+16] - punpckhwd mm5, [edx+16] - punpcklwd mm6, [edx+24] - punpckhwd mm7, [edx+24] - psrad mm0, 16 - psrad mm1, 16 - psrad mm2, 16 - psrad mm3, 16 - psrad mm4, 16 - psrad mm5, 16 - psrad mm6, 16 - psrad mm7, 16 - movq [ecx ], mm0 - movq [ecx+16], mm2 - movq [ecx+24], mm3 - movq [ecx+32], mm4 - movq [ecx+40], mm5 - movq [ecx+48], mm6 - movq [ecx+56], mm7 - movq [ecx+12], mm1 - movd [ecx+ 8], mm2 - ret diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index a23bf761..0c6d463b 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -272,24 +272,21 @@ ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8 %endif + ;----------------------------------------------------------------------------- -; void x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] ) +; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] ) ;----------------------------------------------------------------------------- -cglobal x264_zigzag_scan_4x4_field_sse2, 2,2 - punpcklwd xmm0, [r1] - punpckhwd xmm1, [r1] - punpcklwd xmm2, [r1+16] - punpckhwd xmm3, [r1+16] - psrad xmm0, 16 - psrad xmm1, 16 - psrad xmm2, 16 - psrad xmm3, 16 - movq [r0 ], xmm0 - movdqa [r0+16], xmm1 - movdqa [r0+32], xmm2 - movhlps xmm0, xmm0 - movdqa [r0+48], xmm3 - movq [r0+12], xmm0 - movd [r0+ 8], xmm1 +; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2 +cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3 + pshufw mm0, [r1+4], 0xd2 + movq mm1, [r1+16] + movq mm2, [r1+24] + movq [r0+4], mm0 + movq [r0+16], mm1 + movq [r0+24], mm2 + mov r2d, [r1] + mov [r0], r2d + mov r2d, [r1+12] + mov [r0+12], r2d RET diff --git a/common/x86/dct.h b/common/x86/dct.h index 5e93a99f..5b88dbea 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -46,7 +46,6 @@ void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] ); void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] ); -void x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] ); -void x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] ); +void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] ); #endif diff --git a/encoder/cabac.c b/encoder/cabac.c index e36744e5..c25c65cb 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -707,7 +707,7 @@ static const int last_coeff_flag_offset_8x8[63] = { static const int identity[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; -static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int *l, int i_count ) +static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count ) { const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; diff --git a/encoder/cavlc.c b/encoder/cavlc.c index a20c0b0f..ff1aed24 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -63,7 +63,7 @@ static inline void bs_write_vlc( bs_t *s, vlc_t v ) /**************************************************************************** * block_residual_write_cavlc: ****************************************************************************/ -static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int *l, int i_count ) +static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int16_t *l, int i_count ) { int level[16], run[16]; int i_total, i_trailing; diff --git a/encoder/macroblock.c b/encoder/macroblock.c index d095b7c2..5f288635 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -26,7 +26,7 @@ #define ZIG(i,y,x) level[i] = dct[x][y]; -static inline void zigzag_scan_2x2_dc( int level[4], int16_t dct[2][2] ) +static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] ) { ZIG(0,0,0) ZIG(1,0,1) @@ -43,7 +43,7 @@ static inline void zigzag_scan_2x2_dc( int level[4], int16_t dct[2][2] ) * for the complete mb: if score < 6 -> null * chroma: for the complete mb: if score < 7 -> null */ -static int x264_mb_decimate_score( int *dct, int i_max ) +static int x264_mb_decimate_score( int16_t *dct, int i_max ) { static const int i_ds_table4[16] = { 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 }; @@ -618,7 +618,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) { DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 ); DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 ); - DECLARE_ALIGNED( int, dctscan[16], 16 ); + DECLARE_ALIGNED( int16_t, dctscan[16], 16 ); int i_qp = h->mb.i_qp; int mvp[2]; diff --git a/encoder/macroblock.h b/encoder/macroblock.h index 3ca227da..ee69d407 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -65,7 +65,7 @@ static inline int array_non_zero_int( void *v, int i_count ) return 0; } -static inline int array_non_zero_count( int *v, int i_count ) +static inline int array_non_zero_count( int16_t *v, int i_count ) { int i; int i_nz; diff --git a/tools/checkasm.c b/tools/checkasm.c index 91f80de0..2b947b09 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -347,8 +347,8 @@ static int check_dct( int cpu_ref, int cpu_new ) x264_zigzag_function_t zigzag_ref; x264_zigzag_function_t zigzag_asm; - int32_t level1[64] __attribute__((aligned(16))); - int32_t level2[64] __attribute__((aligned(16))); + int16_t level1[64] __attribute__((aligned(16))); + int16_t level2[64] __attribute__((aligned(16))); #define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \ if( zigzag_asm.name != zigzag_ref.name ) \ @@ -356,7 +356,7 @@ static int check_dct( int cpu_ref, int cpu_new ) used_asm = 1; \ call_c( zigzag_c.name, t1, dct ); \ call_a( zigzag_asm.name, t2, dct ); \ - if( memcmp( t1, t2, size ) ) \ + if( memcmp( t1, t2, size*sizeof(int16_t) ) ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ @@ -371,7 +371,7 @@ static int check_dct( int cpu_ref, int cpu_new ) memcpy( buf4, buf1, 16*FDEC_STRIDE ); \ call_c( zigzag_c.name, t1, buf2, buf3 ); \ call_a( zigzag_asm.name, t2, buf2, buf4 ); \ - if( memcmp( t1, t2, size )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \ + if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ @@ -383,11 +383,11 @@ static int check_dct( int cpu_ref, int cpu_new ) x264_zigzag_init( cpu_new, &zigzag_asm, 0 ); ok = 1; used_asm = 0; - TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64*4 ); - TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16*4 ); - TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15*4 ); - TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16*4 ); - TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15*4 ); + TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 ); + TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 ); + TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15 ); + TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); + TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15 ); report( "zigzag_frame :" ); x264_zigzag_init( 0, &zigzag_c, 1 ); @@ -395,11 +395,11 @@ static int check_dct( int cpu_ref, int cpu_new ) x264_zigzag_init( cpu_new, &zigzag_asm, 1 ); ok = 1; used_asm = 0; - TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64*4 ); - TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16*4 ); - TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15*4 ); - TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16*4 ); - TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15*4 ); + TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 ); + TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 ); + TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15 ); + TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); + TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15 ); report( "zigzag_field :" ); #undef TEST_ZIGZAG_SCAN #undef TEST_ZIGZAG_SUB