From: Fiona Glaser
Date: Wed, 4 Mar 2009 00:21:52 +0000 (-0800)
Subject: Slightly faster 8x16 SAD on Penryn Core 2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b77ea4db6faa06d9120defe6fa1a5f6803d224d4;p=libx264

Slightly faster 8x16 SAD on Penryn Core 2

Same as MMX 8x16 cacheline SAD, but calls SSE2 8x16 SAD in non-cacheline case.
Only Nehalem benefits from sizes smaller than 8x16, and Nehalem doesn't use cacheline functions, so no smaller versions are included.
---

diff --git a/common/pixel.c b/common/pixel.c
index 8f1b1f57..7fa9830b 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -679,14 +679,21 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT_ADS( _sse2 );
         pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
         pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
-#ifdef ARCH_X86
-        if( cpu&X264_CPU_CACHELINE_64 )
+
+        if( cpu&X264_CPU_CACHELINE_64 )
         {
+#ifdef ARCH_X86
             INIT2( sad, _cache64_sse2 );
             INIT2( sad_x3, _cache64_sse2 );
             INIT2( sad_x4, _cache64_sse2 );
-        }
 #endif
+            if( cpu&X264_CPU_SSE2_IS_FAST )
+            {
+                pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2;
+                pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
+            }
+        }
+
         if( cpu&X264_CPU_SSE_MISALIGN )
         {
             INIT2( sad_x3, _sse2_misalign );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 1e3e3ac7..1653abad 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -1142,8 +1142,8 @@ cglobal x264_pixel_sad_8x%1_cache%2_mmxext
     jg .split
 %endmacro
 
-%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
-cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
+%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
+cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
     CHECK_SPLIT r1m, %1, %3
     CHECK_SPLIT r2m, %1, %3
     CHECK_SPLIT r3m, %1, %3
@@ -1207,8 +1207,8 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
 %endif
 %endmacro
 
-%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
-cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
+%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
+cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
     CHECK_SPLIT r1m, %1, %3
     CHECK_SPLIT r2m, %1, %3
     CHECK_SPLIT r3m, %1, %3
@@ -1285,9 +1285,9 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
 %endif
 %endmacro
 
-%macro SADX34_CACHELINE_FUNC 5
-    SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
-    SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
+%macro SADX34_CACHELINE_FUNC 1+
+    SADX3_CACHELINE_FUNC %1
+    SADX4_CACHELINE_FUNC %1
 %endmacro
 
 
@@ -1307,15 +1307,15 @@ SAD8_CACHELINE_FUNC_MMX2 8, 64
 SAD8_CACHELINE_FUNC_MMX2 16, 64
 
 %ifndef ARCH_X86_64
-SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
-SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext, mmxext
 %endif ; !ARCH_X86_64
-SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
-SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext, mmxext
 
 %ifndef ARCH_X86_64
 SAD16_CACHELINE_FUNC sse2, 8
@@ -1325,9 +1325,10 @@ SAD16_CACHELINE_FUNC sse2, 16
 SAD16_CACHELINE_LOOP_SSE2 i
 %assign i i+1
 %endrep
-SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
-SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
+SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
+SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2
 %endif ; !ARCH_X86_64
+SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmxext, sse2
 
 SAD16_CACHELINE_FUNC ssse3, 8
 SAD16_CACHELINE_FUNC ssse3, 16
@@ -1336,5 +1337,6 @@ SAD16_CACHELINE_FUNC ssse3, 16
 SAD16_CACHELINE_LOOP_SSSE3 i
 %assign i i+1
 %endrep
-SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
-SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3
+SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
+SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3
+