From: DRC
Date: Sat, 9 Jul 2016 02:28:48 +0000 (-0500)
Subject: 32-bit AVX2 implementation of integer quantization
X-Git-Tag: 1.5.90~117
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b2921f1bccf3a42ce254116fab6569418166b8ed;p=libjpeg-turbo

32-bit AVX2 implementation of integer quantization
---

diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 431964f..4afb5f6 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -36,7 +36,7 @@ else()
     jdcolor-sse2 jdmerge-sse2 jdsample-sse2 jfdctfst-sse2 jfdctint-sse2
     jidctflt-sse2 jidctfst-sse2 jidctint-sse2 jidctred-sse2 jquantf-sse2
     jquanti-sse2 jccolor-avx2 jcgray-avx2 jcsample-avx2 jdcolor-avx2
-    jdmerge-avx2 jdsample-avx2)
+    jdmerge-avx2 jdsample-avx2 jquanti-avx2)
   message(STATUS "Building i386 SIMD extensions")
 endif()
 
diff --git a/simd/Makefile.am b/simd/Makefile.am
index 0b7b497..18e1b2a 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -50,7 +50,8 @@ libsimd_la_SOURCES = jsimd_i386.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
 	jidctflt-sse2.asm jidctfst-sse2.asm jidctint-sse2.asm \
 	jidctred-sse2.asm jquantf-sse2.asm jquanti-sse2.asm \
 	jccolor-avx2.asm jcgray-avx2.asm jcsample-avx2.asm \
-	jdcolor-avx2.asm jdmerge-avx2.asm jdsample-avx2.asm
+	jdcolor-avx2.asm jdmerge-avx2.asm jdsample-avx2.asm \
+	jquanti-avx2.asm
 
 jccolor-mmx.lo: jccolext-mmx.asm
 jcgray-mmx.lo: jcgryext-mmx.asm
diff --git a/simd/jquanti-avx2.asm b/simd/jquanti-avx2.asm
new file mode 100644
index 0000000..0356185
--- /dev/null
+++ b/simd/jquanti-avx2.asm
@@ -0,0 +1,107 @@
+;
+; jquanti.asm - sample quantization (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2 (JCOEFPTR coef_block, DCTELEM *divisors,
+;                      DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m,n,b)  XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b)  XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)       XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block  ebp+8       ; JCOEFPTR coef_block
+%define divisors    ebp+12      ; DCTELEM *divisors
+%define workspace   ebp+16      ; DCTELEM *workspace
+
+        align   32
+        global  EXTN(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+        push    ebp
+        mov     ebp, esp
+;       push    ebx             ; unused
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     esi, POINTER [workspace]
+        mov     edx, POINTER [divisors]
+        mov     edi, JCOEFPTR [coef_block]
+
+        vmovdqu ymm4, [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+        vmovdqu ymm5, [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+        vmovdqu ymm6, [XMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+        vmovdqu ymm7, [XMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+        vpabsw  ymm0, ymm4
+        vpabsw  ymm1, ymm5
+        vpabsw  ymm2, ymm6
+        vpabsw  ymm3, ymm7
+
+        vpaddw  ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+        vpaddw  ymm1, YMMWORD [CORRECTION(2,0,edx)]
+        vpaddw  ymm2, YMMWORD [CORRECTION(4,0,edx)]
+        vpaddw  ymm3, YMMWORD [CORRECTION(6,0,edx)]
+        vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+        vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
+        vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
+        vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
+        vpmulhuw ymm0, YMMWORD [SCALE(0,0,edx)]      ; scale
+        vpmulhuw ymm1, YMMWORD [SCALE(2,0,edx)]
+        vpmulhuw ymm2, YMMWORD [SCALE(4,0,edx)]
+        vpmulhuw ymm3, YMMWORD [SCALE(6,0,edx)]
+
+        vpsignw ymm0, ymm0, ymm4
+        vpsignw ymm1, ymm1, ymm5
+        vpsignw ymm2, ymm2, ymm6
+        vpsignw ymm3, ymm3, ymm7
+
+        vmovdqu [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+        vmovdqu [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+        vmovdqu [XMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+        vmovdqu [XMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+
+        vzeroupper
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   32
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
index 7bc20bc..d64cc20 100644
--- a/simd/jsimd_i386.c
+++ b/simd/jsimd_i386.c
@@ -915,6 +915,8 @@ jsimd_can_quantize (void)
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if (simd_support & JSIMD_AVX2)
+    return 1;
   if (simd_support & JSIMD_SSE2)
     return 1;
   if (simd_support & JSIMD_MMX)
@@ -950,7 +952,9 @@ GLOBAL(void)
 jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
                 DCTELEM *workspace)
 {
-  if (simd_support & JSIMD_SSE2)
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE2)
     jsimd_quantize_sse2(coef_block, divisors, workspace);
   else if (simd_support & JSIMD_MMX)
     jsimd_quantize_mmx(coef_block, divisors, workspace);