{"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
+ {"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"3DNow", X264_CPU_3DNOW},
{"Altivec", X264_CPU_ALTIVEC},
{"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32},
cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 )
cpu |= X264_CPU_SSSE3;
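+ /* CPUID eax=1: ECX bit 19 indicates SSE4.1 */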
+ if( ecx&0x00080000 )
+ cpu |= X264_CPU_SSE4;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
SATD_X_DECL7( _mmxext )
SATD_X_DECL5( _sse2 )
SATD_X_DECL7( _ssse3 )
+SATD_X_DECL5( _ssse3_phadd )
#endif
/****************************************************************************
INIT2( sad_x4, _cache64_ssse3 );
}
}
+
+ if( cpu&X264_CPU_SSE4 )
+ {
+ // the phadd satd functions use only SSSE3 instructions, but are gated on SSE4: fast on Penryn, slower on Conroe
+ INIT5( satd, _ssse3_phadd );
+ INIT5( satd_x3, _ssse3_phadd );
+ INIT5( satd_x4, _ssse3_phadd );
+ }
#endif //HAVE_MMX
#ifdef ARCH_PPC
LOAD_DIFF_8P %4, %6, [r0+r4], [r2+r5]
%endmacro
-;;; row transform not used, because phaddw is much slower than paddw on a Conroe
-;%macro PHSUMSUB 3
-; movdqa %3, %1
-; phaddw %1, %2
-; phsubw %3, %2
-;%endmacro
-
-;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc
-; PHSUMSUB %1, %2, %5
-; PHSUMSUB %3, %4, %2
-; PHSUMSUB %1, %3, %4
-; PHSUMSUB %5, %2, %3
-;%endmacro
+; phaddw is used only in the 4x4 hadamard, because in the 8x8 it's slower:
+; even on Penryn, phaddw has latency 3 while paddw and punpck* have latency 1.
+; 4x4 is special in that a 4x4 transpose in xmm registers takes extra munging,
+; whereas the phaddw-based transform doesn't care what order the coefficients end up in.
+
+%macro PHSUMSUB 3
+ movdqa %3, %1
+ phaddw %1, %2
+ phsubw %3, %2
+%endmacro
+
+%macro HADAMARD4_ROW_PHADD 5 ; abcd-t -> adtc
+ PHSUMSUB %1, %2, %5
+ PHSUMSUB %3, %4, %2
+ PHSUMSUB %1, %3, %4
+ PHSUMSUB %5, %2, %3
+%endmacro
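+
+; illustrative note: for one group of 4 coefficients x0..x3, two passes of
+; pairwise (a+b, a-b) produce {x0+x1+x2+x3, x0+x1-x2-x3, x0-x1+x2-x3, x0-x1-x2+x3},
+; the same values the paddw/psubw butterfly would give, just spread across
+; registers in a different order; satd only sums |coef|, so that's harmless.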
%macro SUMSUB_BADC 4
paddw %1, %2
paddusw xmm6, xmm2
%endmacro
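+; 8x4 satd kernel using the phaddw-based row transform for the horizontal pass;
+; it is substituted for SATD_8x4_SSE2 (via %define) to build the ssse3_phadd functions.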
+%macro SATD_8x4_PHADD 1
+ LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+%if %1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+%endif
+ HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
+ HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4
+ ABS4 xmm0, xmm3, xmm4, xmm2, xmm1, xmm5
+ paddusw xmm0, xmm3
+ paddusw xmm2, xmm4
+ paddusw xmm6, xmm0
+ paddusw xmm6, xmm2
+%endmacro
+
%macro SATD_START_MMX 0
lea r4, [3*r1] ; 3*stride1
lea r5, [3*r3] ; 3*stride2
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
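+; instantiate the satd functions once more, substituting the phaddw kernel for SATD_8x4_SSE2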
+%define SATD_8x4_SSE2 SATD_8x4_PHADD
+SATDS_SSE2 ssse3_phadd
DECL_X1( satd, mmxext )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
+DECL_X1( satd, ssse3_phadd )
DECL_X1( sa8d, mmxext )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
}
+ if( x264_cpu_detect() & X264_CPU_SSSE3 )
+ {
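+ /* the functions flagged SSE4 so far (ssse3_phadd satd) use only SSSE3
+ instructions, so SSSE3 support is enough to exercise them */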
+ cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
+ }
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
#define X264_CPU_CACHELINE_SPLIT 0x200 /* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_32 0x0400 /* size of a cacheline in bytes */
#define X264_CPU_CACHELINE_64 0x0800
+#define X264_CPU_SSE4 0x1000 /* SSE4.1 */
/* Analyse flags
*/