granicus.if.org Git - libjpeg-turbo/commitdiff
32-bit AVX2 implementation of integer quantization
author: DRC <information@libjpeg-turbo.org>
Sat, 9 Jul 2016 02:28:48 +0000 (21:28 -0500)
committer: DRC <information@libjpeg-turbo.org>
Sat, 9 Jul 2016 02:28:48 +0000 (21:28 -0500)
simd/CMakeLists.txt
simd/Makefile.am
simd/jquanti-avx2.asm [new file with mode: 0644]
simd/jsimd_i386.c

index 431964fdbef1e636f861eac20a678cd126ff7b91..4afb5f6f4f281957f39cf2cbf768057100ba980c 100755 (executable)
@@ -36,7 +36,7 @@ else()
     jdcolor-sse2 jdmerge-sse2 jdsample-sse2 jfdctfst-sse2 jfdctint-sse2
     jidctflt-sse2 jidctfst-sse2 jidctint-sse2 jidctred-sse2 jquantf-sse2
     jquanti-sse2 jccolor-avx2 jcgray-avx2 jcsample-avx2 jdcolor-avx2
-    jdmerge-avx2 jdsample-avx2)
+    jdmerge-avx2 jdsample-avx2 jquanti-avx2)
   message(STATUS "Building i386 SIMD extensions")
 endif()
 
index 0b7b497e73b2ada4238c63d8ce3e7e807709426c..18e1b2aaba0f48124de140243f5f7a7e2174b91b 100644 (file)
@@ -50,7 +50,8 @@ libsimd_la_SOURCES = jsimd_i386.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
        jidctflt-sse2.asm  jidctfst-sse2.asm  jidctint-sse2.asm \
        jidctred-sse2.asm  jquantf-sse2.asm   jquanti-sse2.asm \
        jccolor-avx2.asm   jcgray-avx2.asm    jcsample-avx2.asm \
-       jdcolor-avx2.asm   jdmerge-avx2.asm   jdsample-avx2.asm
+       jdcolor-avx2.asm   jdmerge-avx2.asm   jdsample-avx2.asm \
+       jquanti-avx2.asm
 
 jccolor-mmx.lo:   jccolext-mmx.asm
 jcgray.-mmx.lo:   jcgryext-mmx.asm
diff --git a/simd/jquanti-avx2.asm b/simd/jquanti-avx2.asm
new file mode 100644 (file)
index 0000000..0356185
--- /dev/null
@@ -0,0 +1,107 @@
+;
+; jquanti-avx2.asm - integer sample quantization (32-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"             ; common macros (EXTN, POINTER, XMMBLOCK, ...)
+%include "jdct.inc"                 ; DCT-related types (DCTELEM, SIZEOF_DCTELEM)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; Division is performed branch-free as multiply-high by a precomputed
+; reciprocal, with a correction term added first and a final scale step.
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2 (JCOEFPTR coef_block, DCTELEM *divisors,
+;                      DCTELEM *workspace);
+;
+
+; The divisors table holds three consecutive 8x8 DCTELEM sub-tables:
+%define RECIPROCAL(m,n,b)  XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)  ; rows 0-7
+%define CORRECTION(m,n,b)  XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)  ; rows 8-15
+%define SCALE(m,n,b)       XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)  ; rows 16-23
+
+%define coef_block  ebp+8               ; JCOEFPTR coef_block
+%define divisors    ebp+12              ; DCTELEM *divisors
+%define workspace   ebp+16              ; DCTELEM *workspace
+
+    align       32
+    global      EXTN(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+    push        ebp                     ; standard frame; cdecl args at ebp+8...
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi                     ; callee-saved regs used below
+    push        edi
+
+    mov         esi, POINTER [workspace]  ; esi = raw DCT coefficients (input)
+    mov         edx, POINTER [divisors]   ; edx = reciprocal/correction/scale tables
+    mov         edi, JCOEFPTR [coef_block]  ; edi = quantized output block
+
+    ; Load the whole 8x8 block of 16-bit samples: each ymm holds two 8-sample
+    ; rows (32 bytes), hence the row indices 0, 2, 4, 6.
+    vmovdqu     ymm4, [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, [XMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, [XMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+    vpabsw      ymm0, ymm4              ; operate on magnitudes; the original
+    vpabsw      ymm1, ymm5              ; sign is reapplied with vpsignw below
+    vpabsw      ymm2, ymm6
+    vpabsw      ymm3, ymm7
+
+    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
+    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
+    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
+    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal (divide via mul-high)
+    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
+    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
+    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
+    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
+    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
+    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
+    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]
+
+    vpsignw     ymm0, ymm0, ymm4        ; restore the sign of the input samples
+    vpsignw     ymm1, ymm1, ymm5
+    vpsignw     ymm2, ymm2, ymm6
+    vpsignw     ymm3, ymm3, ymm7
+
+    vmovdqu     [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0  ; store quantized coefs
+    vmovdqu     [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     [XMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     [XMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper                          ; avoid AVX-SSE transition penalties in callers
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
index 7bc20bc9a18257f98fa3773b04d3738cd4bff8c4..d64cc2010912bc270d4d506bbf8a99cdba26c1de 100644 (file)
@@ -915,6 +915,8 @@ jsimd_can_quantize (void)
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if (simd_support & JSIMD_AVX2)
+    return 1;
   if (simd_support & JSIMD_SSE2)
     return 1;
   if (simd_support & JSIMD_MMX)
@@ -950,7 +952,9 @@ GLOBAL(void)
 jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
                 DCTELEM *workspace)
 {
-  if (simd_support & JSIMD_SSE2)
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE2)
     jsimd_quantize_sse2(coef_block, divisors, workspace);
   else if (simd_support & JSIMD_MMX)
     jsimd_quantize_mmx(coef_block, divisors, workspace);