From: DRC
Date: Fri, 8 Jul 2016 18:56:30 +0000 (-0500)
Subject: 64-bit AVX2 implementation of integer quantization
X-Git-Tag: 1.5.90~118
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=eaae2cdb16a3ed7a308f3b834c1be7099880ff01;p=libjpeg-turbo

64-bit AVX2 implementation of integer quantization
---

diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 26420fc..431964f 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -26,7 +26,7 @@ if(SIMD_X86_64)
     jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
     jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
     jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64 jcsample-avx2-64
-    jdcolor-avx2-64 jdmerge-avx2-64 jdsample-avx2-64)
+    jdcolor-avx2-64 jdmerge-avx2-64 jdsample-avx2-64 jquanti-avx2-64)
   message(STATUS "Building x86_64 SIMD extensions")
 else()
   set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx
diff --git a/simd/Makefile.am b/simd/Makefile.am
index 8a06f45..0b7b497 100644
--- a/simd/Makefile.am
+++ b/simd/Makefile.am
@@ -20,7 +20,8 @@ libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
 	jidctflt-sse2-64.asm jidctfst-sse2-64.asm jidctint-sse2-64.asm \
 	jidctred-sse2-64.asm jquantf-sse2-64.asm jquanti-sse2-64.asm \
 	jccolor-avx2-64.asm jcgray-avx2-64.asm jcsample-avx2-64.asm \
-	jdcolor-avx2-64.asm jdmerge-avx2-64.asm jdsample-avx2-64.asm
+	jdcolor-avx2-64.asm jdmerge-avx2-64.asm jdsample-avx2-64.asm \
+	jquanti-avx2-64.asm
 
 jccolor-sse2-64.lo: jccolext-sse2-64.asm
 jcgray-sse2-64.lo: jcgryext-sse2-64.asm
diff --git a/simd/jquanti-avx2-64.asm b/simd/jquanti-avx2-64.asm
new file mode 100644
index 0000000..60ce19f
--- /dev/null
+++ b/simd/jquanti-avx2-64.asm
@@ -0,0 +1,96 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2 (JCOEFPTR coef_block, DCTELEM *divisors,
+;                      DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m,n,b)  XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b)  XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)       XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+        align   32
+        global  EXTN(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+        push    rbp
+        mov     rax, rsp
+        mov     rbp, rsp
+        collect_args 3
+
+        vmovdqu ymm4, [XMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
+        vmovdqu ymm5, [XMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
+        vmovdqu ymm6, [XMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
+        vmovdqu ymm7, [XMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
+        vpabsw  ymm0, ymm4
+        vpabsw  ymm1, ymm5
+        vpabsw  ymm2, ymm6
+        vpabsw  ymm3, ymm7
+
+        vpaddw  ymm0, YMMWORD [CORRECTION(0,0,r11)]  ; correction + roundfactor
+        vpaddw  ymm1, YMMWORD [CORRECTION(2,0,r11)]
+        vpaddw  ymm2, YMMWORD [CORRECTION(4,0,r11)]
+        vpaddw  ymm3, YMMWORD [CORRECTION(6,0,r11)]
+        vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,r11)]  ; reciprocal
+        vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
+        vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
+        vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
+        vpmulhuw ymm0, YMMWORD [SCALE(0,0,r11)]  ; scale
+        vpmulhuw ymm1, YMMWORD [SCALE(2,0,r11)]
+        vpmulhuw ymm2, YMMWORD [SCALE(4,0,r11)]
+        vpmulhuw ymm3, YMMWORD [SCALE(6,0,r11)]
+
+        vpsignw ymm0, ymm0, ymm4
+        vpsignw ymm1, ymm1, ymm5
+        vpsignw ymm2, ymm2, ymm6
+        vpsignw ymm3, ymm3, ymm7
+
+        vmovdqu [XMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
+        vmovdqu [XMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
+        vmovdqu [XMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
+        vmovdqu [XMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
+
+        vzeroupper
+        uncollect_args 3
+        pop     rbp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   32
diff --git a/simd/jsimd.h b/simd/jsimd.h
index fc9bcfe..f3ead87 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -875,6 +875,9 @@ EXTERN(void) jsimd_quantize_mmx
 EXTERN(void) jsimd_quantize_sse2
         (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
+EXTERN(void) jsimd_quantize_avx2
+        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
 EXTERN(void) jsimd_quantize_neon
         (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
index 89efc86..d22cdc8 100644
--- a/simd/jsimd_x86_64.c
+++ b/simd/jsimd_x86_64.c
@@ -783,6 +783,8 @@ jsimd_can_quantize (void)
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if (simd_support & JSIMD_AVX2)
+    return 1;
   if (simd_support & JSIMD_SSE2)
     return 1;
 
@@ -812,7 +814,10 @@ GLOBAL(void)
 jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
                 DCTELEM *workspace)
 {
-  jsimd_quantize_sse2(coef_block, divisors, workspace);
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
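
For reference, the arithmetic that the new jsimd_quantize_avx2() routine applies to each of the 64 coefficients can be modeled in scalar C roughly as follows. This is a minimal sketch, not part of the commit: it assumes the three-block (reciprocal/correction/scale) layout of the divisors table implied by the RECIPROCAL, CORRECTION, and SCALE offsets above, interprets each block as 64 unsigned 16-bit entries, and uses quantize_scalar as a hypothetical name for illustration only.

    #include <stdint.h>

    #define DCTSIZE2 64

    typedef int16_t DCTELEM;
    typedef int16_t JCOEF;
    typedef JCOEF *JCOEFPTR;

    /* Scalar model of the per-coefficient math in jsimd_quantize_avx2().
       The divisors table is assumed to hold three consecutive 64-entry
       blocks of 16-bit values, matching the %define'd offsets. */
    static void quantize_scalar(JCOEFPTR coef_block, const DCTELEM *divisors,
                                const DCTELEM *workspace)
    {
      const uint16_t *reciprocal = (const uint16_t *)divisors;             /* block 0 */
      const uint16_t *correction = (const uint16_t *)divisors + DCTSIZE2;  /* block 1 */
      const uint16_t *scale = (const uint16_t *)divisors + DCTSIZE2 * 2;   /* block 2 */
      int i;

      for (i = 0; i < DCTSIZE2; i++) {
        DCTELEM x = workspace[i];
        uint32_t t = (uint16_t)(x < 0 ? -x : x);  /* vpabsw: |x| */
        JCOEF q;

        t = (t + correction[i]) & 0xFFFF;         /* vpaddw: 16-bit lanes wrap */
        t = (t * reciprocal[i]) >> 16;            /* vpmulhuw: high word of u16 * u16 */
        t = (t * scale[i]) >> 16;                 /* vpmulhuw */
        q = (JCOEF)t;
        /* vpsignw: restore the original sign; a zero input forces a zero
           result regardless of t */
        coef_block[i] = (x > 0) ? q : (x < 0) ? (JCOEF)-q : 0;
      }
    }

The two vpmulhuw steps replace a per-coefficient division by the quantization step with fixed-point multiplication by a precomputed reciprocal, the technique from the Agner Fog reference cited in the file header; adding the correction term to |x| beforehand compensates for the truncation of the high-word multiplies, and vpsignw restores the sign that vpabsw discarded.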