From: Fiona Glaser
Date: Tue, 17 Mar 2009 18:01:57 +0000 (-0700)
Subject: SSE2 zigzag_interleave
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d25d50c9ffb02571c12e13c09356fa08fe97b0b4;p=libx264

SSE2 zigzag_interleave

Replace the PHADD flag with FastShuffle (more accurate naming).
This flag marks asm functions that rely on fast SSE2 shuffle units and are
thus only faster on Phenom, Nehalem, and Penryn CPUs.
---

diff --git a/common/cpu.c b/common/cpu.c
index bf376fc4..860cd957 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -53,7 +53,7 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
     {"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
     {"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
-    {"PHADD", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
+    {"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
     {"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
     {"Cache32", X264_CPU_CACHELINE_32},
@@ -107,7 +107,7 @@ uint32_t x264_cpu_detect( void )
     if( cpu & X264_CPU_SSSE3 )
         cpu |= X264_CPU_SSE2_IS_FAST;
     if( cpu & X264_CPU_SSE4 )
-        cpu |= X264_CPU_PHADD_IS_FAST;
+        cpu |= X264_CPU_SHUFFLE_IS_FAST;
 
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     max_extended_cap = eax;
@@ -124,6 +124,7 @@ uint32_t x264_cpu_detect( void )
             cpu |= X264_CPU_SSE2_IS_FAST;
             cpu |= X264_CPU_SSE_MISALIGN;
             cpu |= X264_CPU_LZCNT;
+            cpu |= X264_CPU_SHUFFLE_IS_FAST;
             x264_cpu_mask_misalign_sse();
         }
         else
diff --git a/common/dct.c b/common/dct.c
index 04301a92..1f8f4b39 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -663,9 +663,9 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
     {
         pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
         pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
+        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
     }
-    if( cpu&X264_CPU_PHADD_IS_FAST )
-        pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
 #endif
 
 #ifdef ARCH_PPC
@@ -678,5 +678,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
 #ifdef HAVE_MMX
     if( cpu&X264_CPU_MMX )
         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
+    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+        pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
 #endif
 }
diff --git a/common/pixel.c b/common/pixel.c
index 38c39260..76d04e0d 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -763,7 +763,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT2( sad_x3, _cache64_ssse3 );
             INIT2( sad_x4, _cache64_ssse3 );
         }
-        if( !(cpu&X264_CPU_PHADD_IS_FAST) )
+        if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
         {
             INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
         }
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index df51926c..6e92df6f 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -35,7 +35,7 @@ pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
-pb_1: times 8 db 1
+pb_1: times 16 db 1
 
 SECTION .text
 
@@ -785,3 +785,50 @@ cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
     shr r0d, 16
     mov [r2+8], r0w
     RET
+
+%macro INTERLEAVE_XMM 1
+    mova   m0, [r1+%1*4+ 0]
+    mova   m1, [r1+%1*4+16]
+    mova   m4, [r1+%1*4+32]
+    mova   m5, [r1+%1*4+48]
+    SBUTTERFLY wd, 0, 1, 6
+    SBUTTERFLY wd, 4, 5, 7
+    SBUTTERFLY wd, 0, 1, 6
+    SBUTTERFLY wd, 4, 5, 7
+    movq   [r0+%1+ 0], m0
+    movhps [r0+%1+ 32], m0
+    movq   [r0+%1+ 64], m1
+    movhps [r0+%1+ 96], m1
+    movq   [r0+%1+ 8], m4
+    movhps [r0+%1+ 40], m4
+    movq   [r0+%1+ 72], m5
+    movhps [r0+%1+104], m5
+%if %1
+    por    m2, m0
+    por    m3, m1
+    por    m2, m4
+    por    m3, m5
+%else
+    SWAP 0,2
+    SWAP 3,1
+    por    m2, m4
+    por    m3, m5
+%endif
+%endmacro
+
+INIT_XMM
+cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
+    INTERLEAVE_XMM  0
+    INTERLEAVE_XMM 16
+    packsswb m2, m3
+    pxor     m5, m5
+    packsswb m2, m2
+    packsswb m2, m2
+    pcmpeqb  m5, m2
+    paddb    m5, [pb_1 GLOBAL]
+    movd    r0d, m5
+    mov [r2+0], r0w
+    shr     r0d, 16
+    mov [r2+8], r0w
+    RET
+
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 5b83d342..44518212 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -69,5 +69,6 @@ void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
 void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
 void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz );
 
 #endif
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 1c1562a3..31fd9758 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -151,7 +151,7 @@ static void print_bench(void)
             if( k < j ) continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
                     b->cpu&X264_CPU_SSE4 ? "sse4" :
-                    b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
+                    b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
                     b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                     b->cpu&X264_CPU_SSE3 ? "sse3" :
                     /* print sse2slow only if there's also a sse2fast version of the same func */
@@ -1364,10 +1364,10 @@ static int check_intra( int cpu_ref, int cpu_new )
     for( i = 0; i < 12; i++ )
         INTRA_TEST( predict_8x8, i, 8, edge );
 
-    used_asm = 1;
     set_func_name("intra_predict_8x8_filter");
     if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
     {
+        used_asm = 1;
         for( i = 0; i < 32; i++ )
         {
             memcpy( edge2, edge, 33 );
@@ -1463,6 +1463,8 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
+        cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
     }
     if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
     {
@@ -1483,7 +1485,8 @@ static int check_all_flags( void )
         cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
+        cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
     }
     if( x264_cpu_detect() & X264_CPU_SSE4 )
     {
diff --git a/x264.h b/x264.h
index 3971992b..26ac421c 100644
--- a/x264.h
+++ b/x264.h
@@ -57,7 +57,7 @@ typedef struct x264_t x264_t;
 #define X264_CPU_SSE2_IS_FAST 0x000100 /* a few functions are only faster on Core2 and Phenom */
 #define X264_CPU_SSE3 0x000200
 #define X264_CPU_SSSE3 0x000400
-#define X264_CPU_PHADD_IS_FAST 0x000800 /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
+#define X264_CPU_SHUFFLE_IS_FAST 0x000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
 #define X264_CPU_SSE4 0x002000 /* SSE4.1 */
 #define X264_CPU_SSE42 0x004000 /* SSE4.2 */