From 42db5e6f8f704a2b0a9edf5d9cd4a17d80e5b816 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Tue, 14 Feb 2012 16:54:03 -0800 Subject: [PATCH] BMI1 decimate functions Intel was nice enough to make tzcnt equal to "rep bsf", which is backwards-compatible. This means we don't actually have to add new functions to make it work. --- common/x86/bitstream-a.asm | 2 +- common/x86/quant-a.asm | 10 +++++----- common/x86/x86inc.asm | 4 ++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm index b3eb8920..4306e50f 100644 --- a/common/x86/bitstream-a.asm +++ b/common/x86/bitstream-a.asm @@ -91,7 +91,7 @@ ALIGN 16 .escape: ; Skip bytes that are known to be valid and r4d, r3d - bsf r3d, r4d + tzcnt r3d, r4d add r1, r3 .escape_loop: inc r1 diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index a0a04622..3c43220b 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -912,7 +912,7 @@ cextern decimate_table8 %macro DECIMATE4x4 1 -;A LUT is faster than bsf on AMD processors. +;A LUT is faster than bsf on older AMD processors. ;This is not true for score64. 
cglobal decimate_score%1, 1,3 %ifdef PIC @@ -947,7 +947,7 @@ cglobal decimate_score%1, 1,3 add al, byte [mask_table + rdx] %else .loop: - bsf ecx, edx + tzcnt ecx, edx shr edx, cl add al, byte [table + rcx] shr edx, 1 @@ -1011,7 +1011,7 @@ cglobal decimate_score64, 1,5 add eax, r3d jne .ret9 .loop: - bsf rcx, r1 + tzcnt rcx, r1 shr r1, cl add al, byte [table + rcx] shr r1, 1 @@ -1047,7 +1047,7 @@ cglobal decimate_score64, 1,5 add r0, r2 jne .ret9 ;r0 is zero at this point, so we don't need to zero it .loop: - bsf ecx, r3 + tzcnt ecx, r3 test r3, r3 je .largerun shrd r3, r4, cl @@ -1073,7 +1073,7 @@ cglobal decimate_score64, 1,5 .largerun: mov r3, r4 xor r4, r4 - bsf ecx, r3 + tzcnt ecx, r3 shr r3, cl shr r3, 1 jne .loop diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index 63b654fd..def62fa2 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -1098,3 +1098,7 @@ AVX_INSTR pfmul, 1, 0, 1 FMA_INSTR pmacsdd, pmulld, paddd FMA_INSTR pmacsww, pmullw, paddw FMA_INSTR pmadcswd, pmaddwd, paddd + +; tzcnt is encoded as "rep bsf"; CPUs without BMI1 ignore the prefix and execute it as plain bsf. +; The result for a zero source and the flags written differ between the two, so code must not depend on either. This lets us use tzcnt without bumping the yasm version requirement yet. +%define tzcnt rep bsf -- 2.40.0