From cb688111fb28225a4d1fe2a45472ac0cd093a08f Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Tue, 30 Dec 2008 20:47:45 -0500 Subject: [PATCH] Add support for SSE4a (Phenom) LZCNT instruction Significantly speeds up coeff_last and coeff_level_run on Phenom CPUs for faster CAVLC and CABAC. Also a small tweak to coeff_level_run asm. --- common/cpu.c | 2 ++ common/quant.c | 13 ++++++++ common/x86/quant-a.asm | 74 ++++++++++++++++++++++++++++++++---------- common/x86/quant.h | 7 ++++ tools/checkasm.c | 14 +++++++- x264.h | 1 + 6 files changed, 92 insertions(+), 19 deletions(-) diff --git a/common/cpu.c b/common/cpu.c index aff31eb8..c1850462 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -54,6 +54,7 @@ const x264_cpu_name_t x264_cpu_names[] = { {"Cache32", X264_CPU_CACHELINE_32}, {"Cache64", X264_CPU_CACHELINE_64}, {"SSEMisalign", X264_CPU_SSE_MISALIGN}, + {"LZCNT", X264_CPU_LZCNT}, {"Slow_mod4_stack", X264_CPU_STACK_MOD4}, {"", 0}, }; @@ -117,6 +118,7 @@ uint32_t x264_cpu_detect( void ) { cpu |= X264_CPU_SSE2_IS_FAST; cpu |= X264_CPU_SSE_MISALIGN; + cpu |= X264_CPU_LZCNT; x264_cpu_mask_misalign_sse(); } else diff --git a/common/quant.c b/common/quant.c index fa38360c..ac798a25 100644 --- a/common/quant.c +++ b/common/quant.c @@ -352,6 +352,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) #endif pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext; pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt; + pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt; + } } if( cpu&X264_CPU_SSE2 ) @@ -376,6 +381,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt; + } } if( cpu&X264_CPU_SSSE3 ) diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 5cbdf4a8..3b92379e 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -688,37 +688,53 @@ DECIMATE8x8 ssse3 or %1, %3 %endmacro +%macro LAST_X86 3 + bsr %1, %2 +%endmacro + +%macro LAST_SSE4A 3 + lzcnt %1, %2 + xor %1, %3 +%endmacro + +%macro COEFF_LAST4 1 %ifdef ARCH_X86_64 -cglobal x264_coeff_last4_mmxext, 1,1 - bsr rax, [r0] +cglobal x264_coeff_last4_%1, 1,1 + LAST rax, [r0], 0x3f shr eax, 4 RET %else -cglobal x264_coeff_last4_mmxext, 0,3 +cglobal x264_coeff_last4_%1, 0,3 mov edx, r0m mov eax, [edx+4] xor ecx, ecx test eax, eax cmovz eax, [edx] setnz cl - bsr eax, eax + LAST eax, eax, 0x1f shr eax, 4 lea eax, [eax+ecx*2] RET %endif +%endmacro + +%define LAST LAST_X86 +COEFF_LAST4 mmxext +%define LAST LAST_SSE4A +COEFF_LAST4 mmxext_lzcnt %macro COEFF_LAST 1 cglobal x264_coeff_last15_%1, 1,3 LAST_MASK r1d, r0-2, r2d xor r1d, 0xffff - bsr eax, r1d + LAST eax, r1d, 0x1f dec eax RET cglobal x264_coeff_last16_%1, 1,3 LAST_MASK r1d, r0, r2d xor r1d, 0xffff - bsr eax, r1d + LAST eax, r1d, 0x1f RET %ifndef ARCH_X86_64 @@ -738,17 +754,18 @@ cglobal x264_coeff_last16_%1, 1,3 not r1d xor r2d, -1 jne .secondhalf - bsr eax, r1d + LAST eax, r1d, 0x1f RET .secondhalf: - bsr eax, r2d + LAST eax, r2d, 0x1f add eax, 32 RET %endif %endmacro %ifdef ARCH_X86_64 - cglobal x264_coeff_last64_sse2, 1,4 +%macro COEFF_LAST64 1 + cglobal x264_coeff_last64_%1, 1,4 LAST_MASK_SSE2 r1d, r0 LAST_MASK_SSE2 r2d, r0+32 LAST_MASK_SSE2 r3d, r0+64 @@ -760,16 +777,25 @@ cglobal x264_coeff_last16_%1, 1,3 shl r3, 32 or r1, r3 not r1 - bsr rax, r1 + LAST rax, r1, 0x3f RET +%endmacro + +%define LAST LAST_X86 +COEFF_LAST64 sse2 +%define LAST LAST_SSE4A +COEFF_LAST64 sse2_lzcnt %endif +%define LAST LAST_X86 %ifndef ARCH_X86_64 %define LAST_MASK LAST_MASK_MMX COEFF_LAST mmxext %endif %define LAST_MASK LAST_MASK_SSE2 COEFF_LAST sse2 +%define LAST LAST_SSE4A +COEFF_LAST sse2_lzcnt ;----------------------------------------------------------------------------- ; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel ) @@ -783,6 +809,15 @@ COEFF_LAST sse2 pmovmskb %1, mm0 %endmacro +%macro LZCOUNT_X86 3 + bsr %1, %2 + xor %1, %3 +%endmacro + +%macro LZCOUNT_SSE4A 3 + lzcnt %1, %2 +%endmacro + ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args %ifdef ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3,4,5,6 @@ -794,21 +829,18 @@ COEFF_LAST sse2 cglobal x264_coeff_level_run%2_%1,0,7 movifnidn t0d, r0m movifnidn t1d, r1m - LAST_MASK t2d, t0-(%2&1)*2, t4d - not t2d - shl t2d, 32-((%2+1)&~1) + LAST_MASK t5d, t0-(%2&1)*2, t4d + not t5d + shl t5d, 32-((%2+1)&~1) mov t4d, %2-1 - mov t5d, t2d - bsr t3d, t2d + LZCOUNT t3d, t5d, 0x1f xor t6d, t6d shl t5d, 1 - xor t3d, 0x1f sub t4d, t3d shl t5d, t3b mov [t1], t4d .loop: - bsr t3d, t5d - xor t3d, 0x1f + LZCOUNT t3d, t5d, 0x1f mov t2w, [t0+t4*2] mov [t1+t6 +36], t3b mov [t1+t6*2+ 4], t2w @@ -820,6 +852,7 @@ cglobal x264_coeff_level_run%2_%1,0,7 RET %endmacro +%define LZCOUNT LZCOUNT_X86 %ifndef ARCH_X86_64 %define LAST_MASK LAST_MASK_MMX COEFF_LEVELRUN mmxext, 15 @@ -830,3 +863,8 @@ COEFF_LEVELRUN mmxext, 4 %define LAST_MASK LAST_MASK_SSE2 COEFF_LEVELRUN sse2, 15 COEFF_LEVELRUN sse2, 16 +%define LZCOUNT LZCOUNT_SSE4A +COEFF_LEVELRUN sse2_lzcnt, 15 +COEFF_LEVELRUN sse2_lzcnt, 16 +%define LAST_MASK LAST_MASK4_MMX +COEFF_LEVELRUN mmxext_lzcnt, 4 diff --git a/common/x86/quant.h b/common/x86/quant.h index 46186ceb..878699f9 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -64,10 +64,17 @@ int x264_coeff_last64_mmxext( int16_t *dct ); int x264_coeff_last15_sse2( int16_t *dct ); int x264_coeff_last16_sse2( int16_t *dct ); int x264_coeff_last64_sse2( int16_t *dct ); +int x264_coeff_last4_mmxext_lzcnt( int16_t *dct ); +int x264_coeff_last15_sse2_lzcnt( int16_t *dct ); +int x264_coeff_last16_sse2_lzcnt( int16_t *dct ); +int x264_coeff_last64_sse2_lzcnt( int16_t *dct ); int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_mmxext_lzcnt( int16_t *dct, x264_run_level_t *runlevel ); #endif diff --git a/tools/checkasm.c b/tools/checkasm.c index d154941e..203a5963 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -156,7 +156,8 @@ static void print_bench(void) b->cpu&X264_CPU_MMX ? "mmx" : "c", b->cpu&X264_CPU_CACHELINE_32 ? "_c32" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : - b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : "", + b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : + b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "", ((int64_t)10*b->cycles/b->den - nop_time)/4 ); } } @@ -1392,6 +1393,11 @@ static int check_all_flags( void ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" ); cpu1 &= ~X264_CPU_CACHELINE_32; #endif + if( x264_cpu_detect() & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } } if( x264_cpu_detect() & X264_CPU_SSE2 ) { @@ -1405,6 +1411,12 @@ static int check_all_flags( void ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" ); cpu1 &= ~X264_CPU_SSE_MISALIGN; } + if( x264_cpu_detect() & X264_CPU_LZCNT ) + { + cpu1 &= ~X264_CPU_CACHELINE_64; + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } if( x264_cpu_detect() & X264_CPU_SSE3 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" ); if( x264_cpu_detect() & X264_CPU_SSSE3 ) diff --git a/x264.h b/x264.h index 51be79ee..8c517b1e 100644 --- a/x264.h +++ b/x264.h @@ -62,6 +62,7 @@ typedef struct x264_t x264_t; #define X264_CPU_SSE4 0x002000 /* SSE4.1 */ #define X264_CPU_SSE42 0x004000 /* SSE4.2 */ #define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */ +#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */ /* Analyse flags */ -- 2.40.0