From: Loren Merritt Date: Sat, 7 Jun 2008 05:30:37 +0000 (-0600) Subject: enable ssse3 phadd satd on Penryn. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f9ad5ee2564bb272635f0c69fefa28e0b1b47f37;p=libx264 enable ssse3 phadd satd on Penryn. --- diff --git a/common/cpu.c b/common/cpu.c index 47a72f76..3ebe970f 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -47,6 +47,7 @@ const struct { {"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2}, {"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3}, {"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, + {"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, {"3DNow", X264_CPU_3DNOW}, {"Altivec", X264_CPU_ALTIVEC}, {"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32}, @@ -88,6 +89,8 @@ uint32_t x264_cpu_detect( void ) cpu |= X264_CPU_SSE3; if( ecx&0x00000200 ) cpu |= X264_CPU_SSSE3; + if( ecx&0x00080000 ) + cpu |= X264_CPU_SSE4; x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx ); max_extended_cap = eax; diff --git a/common/pixel.c b/common/pixel.c index 133968cc..0d00b6e5 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -360,6 +360,7 @@ SATD_X_DECL7() SATD_X_DECL7( _mmxext ) SATD_X_DECL5( _sse2 ) SATD_X_DECL7( _ssse3 ) +SATD_X_DECL5( _ssse3_phadd ) #endif /**************************************************************************** @@ -649,6 +650,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( sad_x4, _cache64_ssse3 ); } } + + if( cpu&X264_CPU_SSE4 ) + { + // enabled on Penryn, but slower on Conroe + INIT5( satd, _ssse3_phadd ); + INIT5( satd_x3, _ssse3_phadd ); + INIT5( satd_x4, _ssse3_phadd ); + } #endif //HAVE_MMX #ifdef ARCH_PPC diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 9eed1dbc..361e2a63 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -274,19 +274,23 @@ SSD_SSE2 8, 4 LOAD_DIFF_8P 
%4, %6, [r0+r4], [r2+r5] %endmacro -;;; row transform not used, because phaddw is much slower than paddw on a Conroe -;%macro PHSUMSUB 3 -; movdqa %3, %1 -; phaddw %1, %2 -; phsubw %3, %2 -;%endmacro - -;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc -; PHSUMSUB %1, %2, %5 -; PHSUMSUB %3, %4, %2 -; PHSUMSUB %1, %3, %4 -; PHSUMSUB %5, %2, %3 -;%endmacro +; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower: +; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1. +; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging, +; whereas phaddw-based transform doesn't care what order the coefs end up in. + +%macro PHSUMSUB 3 + movdqa %3, %1 + phaddw %1, %2 + phsubw %3, %2 +%endmacro + +%macro HADAMARD4_ROW_PHADD 5 ; abcd-t -> adtc + PHSUMSUB %1, %2, %5 + PHSUMSUB %3, %4, %2 + PHSUMSUB %1, %3, %4 + PHSUMSUB %5, %2, %3 +%endmacro %macro SUMSUB_BADC 4 paddw %1, %2 @@ -494,6 +498,21 @@ SSD_SSE2 8, 4 paddusw xmm6, xmm2 %endmacro +%macro SATD_8x4_PHADD 1 + LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 +%if %1 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%endif + HADAMARD4_1D xmm0, xmm1, xmm2, xmm3 + HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4 + ABS4 xmm0, xmm3, xmm4, xmm2, xmm1, xmm5 + paddusw xmm0, xmm3 + paddusw xmm2, xmm4 + paddusw xmm6, xmm0 + paddusw xmm6, xmm2 +%endmacro + %macro SATD_START_MMX 0 lea r4, [3*r1] ; 3*stride1 lea r5, [3*r3] ; 3*stride2 @@ -1279,6 +1298,8 @@ SA8D_16x16_32 ssse3 INTRA_SA8D_SSE2 ssse3 INTRA_SATDS_MMX ssse3 SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3. 
+%define SATD_8x4_SSE2 SATD_8x4_PHADD +SATDS_SSE2 ssse3_phadd diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 92adfbfb..6aa556a8 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -51,6 +51,7 @@ DECL_X1( ssd, sse2 ) DECL_X1( satd, mmxext ) DECL_X1( satd, sse2 ) DECL_X1( satd, ssse3 ) +DECL_X1( satd, ssse3_phadd ) DECL_X1( sa8d, mmxext ) DECL_X1( sa8d, sse2 ) DECL_X1( sa8d, ssse3 ) diff --git a/tools/checkasm.c b/tools/checkasm.c index ab3c4f95..115e2217 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -120,6 +120,7 @@ static void print_bench(void) for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ ); if( k<j ) continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, + b->cpu&X264_CPU_SSE4 ? "sse4" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSE3 ? "sse3" : b->cpu&X264_CPU_SSE2 ? "sse2" : @@ -1142,6 +1143,11 @@ int check_all_flags( void ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" ); } + if( x264_cpu_detect() & X264_CPU_SSSE3 ) + { + cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); + } #elif ARCH_PPC if( x264_cpu_detect() & X264_CPU_ALTIVEC ) { diff --git a/x264.h b/x264.h index ff4cc24b..7b390495 100644 --- a/x264.h +++ b/x264.h @@ -58,6 +58,7 @@ typedef struct x264_t x264_t; #define X264_CPU_CACHELINE_SPLIT 0x200 /* avoid memory loads that span the border between two cachelines */ #define X264_CPU_CACHELINE_32 0x0400 /* size of a cacheline in bytes */ #define X264_CPU_CACHELINE_64 0x0800 +#define X264_CPU_SSE4 0x001000 /* sse 4.1 */ /* Analyse flags */