From: DRC
Date: Sat, 9 Jul 2016 02:28:48 +0000 (-0500)
Subject: 32-bit AVX2 implementation of integer quantization
X-Git-Tag: 1.5.90~117
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b2921f1bccf3a42ce254116fab6569418166b8ed;p=libjpeg-turbo

32-bit AVX2 implementation of integer quantization
---

diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 431964f..4afb5f6 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -36,7 +36,7 @@ else()
     jdcolor-sse2 jdmerge-sse2 jdsample-sse2 jfdctfst-sse2 jfdctint-sse2
     jidctflt-sse2 jidctfst-sse2 jidctint-sse2 jidctred-sse2 jquantf-sse2
     jquanti-sse2 jccolor-avx2 jcgray-avx2 jcsample-avx2 jdcolor-avx2
-    jdmerge-avx2 jdsample-avx2)
+    jdmerge-avx2 jdsample-avx2 jquanti-avx2)
   message(STATUS "Building i386 SIMD extensions")
 endif()
 
diff --git a/simd/Makefile.am b/simd/Makefile.am
index 0b7b497..18e1b2a 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -50,7 +50,8 @@ libsimd_la_SOURCES = jsimd_i386.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
 	jidctflt-sse2.asm jidctfst-sse2.asm jidctint-sse2.asm \
 	jidctred-sse2.asm jquantf-sse2.asm jquanti-sse2.asm \
 	jccolor-avx2.asm jcgray-avx2.asm jcsample-avx2.asm \
-	jdcolor-avx2.asm jdmerge-avx2.asm jdsample-avx2.asm
+	jdcolor-avx2.asm jdmerge-avx2.asm jdsample-avx2.asm \
+	jquanti-avx2.asm
 
 jccolor-mmx.lo: jccolext-mmx.asm
 jcgray-mmx.lo: jcgryext-mmx.asm
diff --git a/simd/jquanti-avx2.asm b/simd/jquanti-avx2.asm
new file mode 100644
index 0000000..0356185
--- /dev/null
+++ b/simd/jquanti-avx2.asm
@@ -0,0 +1,107 @@
+;
+; jquanti.asm - sample quantization (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2 (JCOEFPTR coef_block, DCTELEM *divisors,
+;                      DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m,n,b)  XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b)  XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)       XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block  ebp+8       ; JCOEFPTR coef_block
+%define divisors    ebp+12      ; DCTELEM *divisors
+%define workspace   ebp+16      ; DCTELEM *workspace
+
+        align   32
+        global  EXTN(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+        push    ebp
+        mov     ebp, esp
+;       push    ebx             ; unused
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     esi, POINTER [workspace]
+        mov     edx, POINTER [divisors]
+        mov     edi, JCOEFPTR [coef_block]
+
+        vmovdqu ymm4, [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+        vmovdqu ymm5, [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+        vmovdqu ymm6, [XMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+        vmovdqu ymm7, [XMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+        vpabsw  ymm0, ymm4
+        vpabsw  ymm1, ymm5
+        vpabsw  ymm2, ymm6
+        vpabsw  ymm3, ymm7
+
+        vpaddw  ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+        vpaddw  ymm1, YMMWORD [CORRECTION(2,0,edx)]
+        vpaddw  ymm2, YMMWORD [CORRECTION(4,0,edx)]
+        vpaddw  ymm3, YMMWORD [CORRECTION(6,0,edx)]
+        vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+        vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
+        vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
+        vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
+        vpmulhuw ymm0, YMMWORD [SCALE(0,0,edx)]      ; scale
+        vpmulhuw ymm1, YMMWORD [SCALE(2,0,edx)]
+        vpmulhuw ymm2, YMMWORD [SCALE(4,0,edx)]
+        vpmulhuw ymm3, YMMWORD [SCALE(6,0,edx)]
+
+        vpsignw ymm0, ymm0, ymm4
+        vpsignw ymm1, ymm1, ymm5
+        vpsignw ymm2, ymm2, ymm6
+        vpsignw ymm3, ymm3, ymm7
+
+        vmovdqu [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+        vmovdqu [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+        vmovdqu [XMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+        vmovdqu [XMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+
+        vzeroupper
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   32
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
index 7bc20bc..d64cc20 100644
--- a/simd/jsimd_i386.c
+++ b/simd/jsimd_i386.c
@@ -915,6 +915,8 @@ jsimd_can_quantize (void)
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if (simd_support & JSIMD_AVX2)
+    return 1;
   if (simd_support & JSIMD_SSE2)
     return 1;
   if (simd_support & JSIMD_MMX)
@@ -950,7 +952,9 @@ GLOBAL(void)
 jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
                 DCTELEM *workspace)
 {
-  if (simd_support & JSIMD_SSE2)
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE2)
     jsimd_quantize_sse2(coef_block, divisors, workspace);
   else if (simd_support & JSIMD_MMX)
     jsimd_quantize_mmx(coef_block, divisors, workspace);