From: DRC
Date: Fri, 8 Jul 2016 18:56:30 +0000 (-0500)
Subject: 64-bit AVX2 implementation of integer quantization
X-Git-Tag: 1.5.90~118
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=eaae2cdb16a3ed7a308f3b834c1be7099880ff01;p=libjpeg-turbo

64-bit AVX2 implementation of integer quantization
---

diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 26420fc..431964f 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -26,7 +26,7 @@ if(SIMD_X86_64)
     jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
     jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
     jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64 jcsample-avx2-64
-    jdcolor-avx2-64 jdmerge-avx2-64 jdsample-avx2-64)
+    jdcolor-avx2-64 jdmerge-avx2-64 jdsample-avx2-64 jquanti-avx2-64)
   message(STATUS "Building x86_64 SIMD extensions")
 else()
   set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx
diff --git a/simd/Makefile.am b/simd/Makefile.am
index 8a06f45..0b7b497 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -20,7 +20,8 @@ libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
 	jidctflt-sse2-64.asm jidctfst-sse2-64.asm jidctint-sse2-64.asm \
 	jidctred-sse2-64.asm jquantf-sse2-64.asm jquanti-sse2-64.asm \
 	jccolor-avx2-64.asm jcgray-avx2-64.asm jcsample-avx2-64.asm \
-	jdcolor-avx2-64.asm jdmerge-avx2-64.asm jdsample-avx2-64.asm
+	jdcolor-avx2-64.asm jdmerge-avx2-64.asm jdsample-avx2-64.asm \
+	jquanti-avx2-64.asm
 
 jccolor-sse2-64.lo: jccolext-sse2-64.asm
 jcgray-sse2-64.lo: jcgryext-sse2-64.asm
diff --git a/simd/jquanti-avx2-64.asm b/simd/jquanti-avx2-64.asm
new file mode 100644
index 0000000..60ce19f
--- /dev/null
+++ b/simd/jquanti-avx2-64.asm
@@ -0,0 +1,96 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2 (JCOEFPTR coef_block, DCTELEM *divisors,
+;                      DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m,n,b)  XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b)  XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)       XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+        align   32
+        global  EXTN(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+        push    rbp
+        mov     rax, rsp
+        mov     rbp, rsp
+        collect_args 3
+
+        vmovdqu ymm4, [XMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
+        vmovdqu ymm5, [XMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
+        vmovdqu ymm6, [XMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
+        vmovdqu ymm7, [XMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
+        vpabsw  ymm0, ymm4
+        vpabsw  ymm1, ymm5
+        vpabsw  ymm2, ymm6
+        vpabsw  ymm3, ymm7
+
+        vpaddw  ymm0, YMMWORD [CORRECTION(0,0,r11)]  ; correction + roundfactor
+        vpaddw  ymm1, YMMWORD [CORRECTION(2,0,r11)]
+        vpaddw  ymm2, YMMWORD [CORRECTION(4,0,r11)]
+        vpaddw  ymm3, YMMWORD [CORRECTION(6,0,r11)]
+        vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,r11)]  ; reciprocal
+        vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
+        vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
+        vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
+        vpmulhuw ymm0, YMMWORD [SCALE(0,0,r11)]  ; scale
+        vpmulhuw ymm1, YMMWORD [SCALE(2,0,r11)]
+        vpmulhuw ymm2, YMMWORD [SCALE(4,0,r11)]
+        vpmulhuw ymm3, YMMWORD [SCALE(6,0,r11)]
+
+        vpsignw ymm0, ymm0, ymm4
+        vpsignw ymm1, ymm1, ymm5
+        vpsignw ymm2, ymm2, ymm6
+        vpsignw ymm3, ymm3, ymm7
+
+        vmovdqu [XMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
+        vmovdqu [XMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
+        vmovdqu [XMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
+        vmovdqu [XMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
+
+        vzeroupper
+        uncollect_args 3
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   32
diff --git a/simd/jsimd.h b/simd/jsimd.h
index fc9bcfe..f3ead87 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -875,6 +875,9 @@ EXTERN(void) jsimd_quantize_mmx
 EXTERN(void) jsimd_quantize_sse2
         (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
+EXTERN(void) jsimd_quantize_avx2
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
 EXTERN(void) jsimd_quantize_neon
         (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
index 89efc86..d22cdc8 100644
--- a/simd/jsimd_x86_64.c
+++ b/simd/jsimd_x86_64.c
@@ -783,6 +783,8 @@ jsimd_can_quantize (void)
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if (simd_support & JSIMD_AVX2)
+    return 1;
   if (simd_support & JSIMD_SSE2)
     return 1;
 
@@ -812,7 +814,10 @@ GLOBAL(void)
 jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
                 DCTELEM *workspace)
 {
-  jsimd_quantize_sse2(coef_block, divisors, workspace);
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
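
For reference, the arithmetic that the new jsimd_quantize_avx2() routine applies to each of the 64 coefficients can be modeled in scalar C roughly as follows. This is a minimal sketch, not part of the commit: it assumes the three-block (reciprocal/correction/scale) layout of the divisors table implied by the RECIPROCAL, CORRECTION, and SCALE offsets above, interprets each block as 64 unsigned 16-bit entries, and uses quantize_scalar as a hypothetical name for illustration only.

    #include <stdint.h>

    #define DCTSIZE2 64

    typedef int16_t DCTELEM;
    typedef int16_t JCOEF;
    typedef JCOEF *JCOEFPTR;

    /* Scalar model of the per-coefficient math in jsimd_quantize_avx2().
       The divisors table is assumed to hold three consecutive 64-entry
       blocks of 16-bit values, matching the %define'd offsets. */
    static void quantize_scalar(JCOEFPTR coef_block, const DCTELEM *divisors,
                                const DCTELEM *workspace)
    {
      const uint16_t *reciprocal = (const uint16_t *)divisors;             /* block 0 */
      const uint16_t *correction = (const uint16_t *)divisors + DCTSIZE2;  /* block 1 */
      const uint16_t *scale = (const uint16_t *)divisors + DCTSIZE2 * 2;   /* block 2 */
      int i;

      for (i = 0; i < DCTSIZE2; i++) {
        DCTELEM x = workspace[i];
        uint32_t t = (uint16_t)(x < 0 ? -x : x);  /* vpabsw: |x| */
        JCOEF q;

        t = (t + correction[i]) & 0xFFFF;         /* vpaddw: 16-bit lanes wrap */
        t = (t * reciprocal[i]) >> 16;            /* vpmulhuw: high word of u16 * u16 */
        t = (t * scale[i]) >> 16;                 /* vpmulhuw */
        q = (JCOEF)t;
        /* vpsignw: restore the original sign; a zero input forces a zero
           result regardless of t */
        coef_block[i] = (x > 0) ? q : (x < 0) ? (JCOEF)-q : 0;
      }
    }

The two vpmulhuw steps replace a per-coefficient division by the quantization step with fixed-point multiplication by a precomputed reciprocal, the technique from the Agner Fog reference cited in the file header; adding the correction term to |x| beforehand compensates for the truncation of the high-word multiplies, and vpsignw restores the sign that vpabsw discarded.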