]> granicus.if.org Git - libjpeg-turbo/commitdiff
64-bit AVX2 implementation of integer quantization
authorDRC <information@libjpeg-turbo.org>
Fri, 8 Jul 2016 18:56:30 +0000 (13:56 -0500)
committerDRC <information@libjpeg-turbo.org>
Sat, 9 Jul 2016 02:15:27 +0000 (21:15 -0500)
simd/CMakeLists.txt
simd/Makefile.am
simd/jquanti-avx2-64.asm [new file with mode: 0644]
simd/jsimd.h
simd/jsimd_x86_64.c

index 26420fcc5a3520994f88dbd23109836ae1321a02..431964fdbef1e636f861eac20a678cd126ff7b91 100755 (executable)
@@ -26,7 +26,7 @@ if(SIMD_X86_64)
     jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
     jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
     jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64 jcsample-avx2-64
-    jdcolor-avx2-64 jdmerge-avx2-64 jdsample-avx2-64)
+    jdcolor-avx2-64 jdmerge-avx2-64 jdsample-avx2-64 jquanti-avx2-64)
   message(STATUS "Building x86_64 SIMD extensions")
 else()
   set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx
index 8a06f45c126f0abbad8fe644444b6ba3f064a8a6..0b7b497e73b2ada4238c63d8ce3e7e807709426c 100644 (file)
@@ -20,7 +20,8 @@ libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
        jidctflt-sse2-64.asm  jidctfst-sse2-64.asm  jidctint-sse2-64.asm \
        jidctred-sse2-64.asm  jquantf-sse2-64.asm   jquanti-sse2-64.asm \
        jccolor-avx2-64.asm   jcgray-avx2-64.asm    jcsample-avx2-64.asm \
-       jdcolor-avx2-64.asm   jdmerge-avx2-64.asm   jdsample-avx2-64.asm
+       jdcolor-avx2-64.asm   jdmerge-avx2-64.asm   jdsample-avx2-64.asm \
+       jquanti-avx2-64.asm
 
 jccolor-sse2-64.lo:  jccolext-sse2-64.asm
 jcgray-sse2-64.lo:   jcgryext-sse2-64.asm
diff --git a/simd/jquanti-avx2-64.asm b/simd/jquanti-avx2-64.asm
new file mode 100644 (file)
index 0000000..60ce19f
--- /dev/null
@@ -0,0 +1,96 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2 (JCOEFPTR coef_block, DCTELEM *divisors,
+;                      DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m,n,b)  XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)  ; reciprocal table (divisors rows 0-7)
+%define CORRECTION(m,n,b)  XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)  ; correction + roundfactor table (rows 8-15)
+%define SCALE(m,n,b)       XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)  ; scale-factor table (rows 16-23)
+
+; r10 = JCOEFPTR coef_block    (output: quantized coefficients)
+; r11 = DCTELEM *divisors      (reciprocal/correction/scale tables, laid out per the macros above)
+; r12 = DCTELEM *workspace     (input: raw forward-DCT coefficients)
+
+    align       32                     ; align entry point of this hot call target
+    global      EXTN(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+    push        rbp                    ; rbp is callee-saved in both SysV and Win64 ABIs
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3                     ; jsimdext.inc macro: make the 3 args available in r10-r12 (register map above)
+
+    vmovdqu     ymm4, [XMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]  ; ymm4 = signed input coefs, 16 words (rows 0-1)
+    vmovdqu     ymm5, [XMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]  ; ymm5 = rows 2-3
+    vmovdqu     ymm6, [XMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]  ; ymm6 = rows 4-5
+    vmovdqu     ymm7, [XMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]  ; ymm7 = rows 6-7
+    vpabsw      ymm0, ymm4             ; quantize on magnitudes; original signs kept in ymm4-ymm7
+    vpabsw      ymm1, ymm5             ; and restored by VPSIGNW below
+    vpabsw      ymm2, ymm6
+    vpabsw      ymm3, ymm7
+
+    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,r11)]  ; correction + roundfactor
+    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,r11)]
+    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,r11)]
+    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,r11)]
+    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,r11)]  ; reciprocal: high 16 bits of u16*u16 product
+    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,r11)]  ; approximates division (Agner Fog's method,
+    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,r11)]  ; see header comment)
+    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
+    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,r11)]       ; scale (second high-word multiply)
+    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,r11)]
+    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,r11)]
+    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,r11)]
+
+    vpsignw     ymm0, ymm0, ymm4       ; reapply sign of the original coefficient
+    vpsignw     ymm1, ymm1, ymm5       ; (VPSIGNW also zeroes words whose source was 0)
+    vpsignw     ymm2, ymm2, ymm6
+    vpsignw     ymm3, ymm3, ymm7
+
+    vmovdqu     [XMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0  ; store quantized rows 0-1
+    vmovdqu     [XMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1  ; rows 2-3
+    vmovdqu     [XMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2  ; rows 4-5
+    vmovdqu     [XMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3  ; rows 6-7
+
+    vzeroupper                         ; avoid AVX->SSE transition penalty on return to caller
+    uncollect_args 3                   ; counterpart of collect_args (jsimdext.inc)
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
index fc9bcfea1f935441fc2f6906fb5ad3c8d0ae8b2e..f3ead8749239d7aac48e866aa955264cadbdb980 100644 (file)
@@ -875,6 +875,9 @@ EXTERN(void) jsimd_quantize_mmx
 EXTERN(void) jsimd_quantize_sse2
         (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
+EXTERN(void) jsimd_quantize_avx2
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
 EXTERN(void) jsimd_quantize_neon
         (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
index 89efc86856701cc9addf39a0c315b728280db3f7..d22cdc81e7b0951cba11c22ad0f680c9695ac1c2 100644 (file)
@@ -783,6 +783,8 @@ jsimd_can_quantize (void)
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if (simd_support & JSIMD_AVX2)
+    return 1;
   if (simd_support & JSIMD_SSE2)
     return 1;
 
@@ -812,7 +814,10 @@ GLOBAL(void)
 jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
                 DCTELEM *workspace)
 {
-  jsimd_quantize_sse2(coef_block, divisors, workspace);
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)