From: DRC Date: Fri, 20 May 2016 15:45:32 +0000 (-0500) Subject: Lay the groundwork for 64-bit AVX2 SIMD support X-Git-Tag: 1.5.90~135 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2cf199cbbdcd71c570d794d1d6f38e4101515021;p=libjpeg-turbo Lay the groundwork for 64-bit AVX2 SIMD support --- diff --git a/jmemmgr.c b/jmemmgr.c index 9174ad3..7f6b149 100644 --- a/jmemmgr.c +++ b/jmemmgr.c @@ -85,7 +85,9 @@ round_up_pow2 (size_t a, size_t b) #ifndef WITH_SIMD #define ALIGN_SIZE sizeof(double) #else -#define ALIGN_SIZE 16 /* Most SIMD implementations require this */ +#define ALIGN_SIZE 32 /* Most of the SIMD instructions we support require + 16-byte (128-bit) alignment, but AVX2 requires + 32-byte alignment. */ #endif #endif diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt index 37938ec..2738c77 100755 --- a/simd/CMakeLists.txt +++ b/simd/CMakeLists.txt @@ -21,7 +21,7 @@ if(CMAKE_BUILD_TYPE STREQUAL "Debug" endif() if(SIMD_X86_64) - set(SIMD_BASENAMES jfdctflt-sse-64 jccolor-sse2-64 jcgray-sse2-64 + set(SIMD_BASENAMES jsimdcpu-64 jfdctflt-sse-64 jccolor-sse2-64 jcgray-sse2-64 jchuff-sse2-64 jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64 jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64 jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64 diff --git a/simd/Makefile.am b/simd/Makefile.am index fad6c8c..a908385 100644 --- a/simd/Makefile.am +++ b/simd/Makefile.am @@ -12,7 +12,8 @@ EXTRA_DIST = nasm_lt.sh CMakeLists.txt \ if SIMD_X86_64 libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \ - jcolsamp.inc jdct.inc jpeg_nbits_table.inc jfdctflt-sse-64.asm \ + jcolsamp.inc jdct.inc jpeg_nbits_table.inc jsimdcpu-64.asm \ + jfdctflt-sse-64.asm \ jccolor-sse2-64.asm jcgray-sse2-64.asm jchuff-sse2-64.asm \ jcsample-sse2-64.asm jdcolor-sse2-64.asm jdmerge-sse2-64.asm \ jdsample-sse2-64.asm jfdctfst-sse2-64.asm jfdctint-sse2-64.asm \ diff --git a/simd/jccolext-sse2-64.asm b/simd/jccolext-sse2-64.asm index 
258dfad..8fe072d 100644 --- a/simd/jccolext-sse2-64.asm +++ b/simd/jccolext-sse2-64.asm @@ -1,7 +1,7 @@ ; ; jccolext.asm - colorspace conversion (64-bit SSE2) ; -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -36,7 +36,7 @@ %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 8 - align 16 + align 32 global EXTN(jsimd_rgb_ycc_convert_sse2) @@ -483,4 +483,4 @@ EXTN(jsimd_rgb_ycc_convert_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jccolor-sse2-64.asm b/simd/jccolor-sse2-64.asm index af6e1e2..0482650 100644 --- a/simd/jccolor-sse2-64.asm +++ b/simd/jccolor-sse2-64.asm @@ -1,7 +1,7 @@ ; ; jccolor.asm - colorspace conversion (64-bit SSE2) ; -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -34,7 +34,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_rgb_ycc_convert_sse2) EXTN(jconst_rgb_ycc_convert_sse2): @@ -46,7 +46,7 @@ PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jcgray-sse2-64.asm b/simd/jcgray-sse2-64.asm index 61c9682..1167918 100644 --- a/simd/jcgray-sse2-64.asm +++ b/simd/jcgray-sse2-64.asm @@ -1,7 +1,7 @@ ; ; jcgray.asm - grayscale colorspace conversion (64-bit SSE2) ; -; Copyright (C) 2011, D. R. Commander. +; Copyright (C) 2011, 2016, D. R. 
Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -30,7 +30,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_rgb_gray_convert_sse2) EXTN(jconst_rgb_gray_convert_sse2): @@ -39,7 +39,7 @@ PW_F0299_F0337 times 4 dw F_0_299, F_0_337 PW_F0114_F0250 times 4 dw F_0_114, F_0_250 PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jcgryext-sse2-64.asm b/simd/jcgryext-sse2-64.asm index 663104e..d26ada3 100644 --- a/simd/jcgryext-sse2-64.asm +++ b/simd/jcgryext-sse2-64.asm @@ -1,7 +1,7 @@ ; ; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2) ; -; Copyright (C) 2011, D. R. Commander. +; Copyright (C) 2011, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -36,7 +36,7 @@ %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_rgb_gray_convert_sse2) @@ -362,4 +362,4 @@ EXTN(jsimd_rgb_gray_convert_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 32 diff --git a/simd/jchuff-sse2-64.asm b/simd/jchuff-sse2-64.asm index 486fd80..91d8ecf 100644 --- a/simd/jchuff-sse2-64.asm +++ b/simd/jchuff-sse2-64.asm @@ -25,14 +25,14 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_huff_encode_one_block) EXTN(jconst_huff_encode_one_block): %include "jpeg_nbits_table.inc" - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -182,7 +182,7 @@ EXTN(jconst_huff_encode_one_block): %define put_bits r9d %define buffer rax - align 16 + align 32 global EXTN(jsimd_huff_encode_one_block_sse2) EXTN(jsimd_huff_encode_one_block_sse2): @@ -357,4 +357,4 @@ EXTN(jsimd_huff_encode_one_block_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jcsample-sse2-64.asm b/simd/jcsample-sse2-64.asm index 29fc982..2e5d3f5 100644 --- a/simd/jcsample-sse2-64.asm +++ b/simd/jcsample-sse2-64.asm @@ -2,7 +2,7 @@ ; jcsample.asm - downsampling (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -39,7 +39,7 @@ ; r14 = JSAMPARRAY input_data ; r15 = JSAMPARRAY output_data - align 16 + align 32 global EXTN(jsimd_h2v1_downsample_sse2) EXTN(jsimd_h2v1_downsample_sse2): @@ -183,7 +183,7 @@ EXTN(jsimd_h2v1_downsample_sse2): ; r14 = JSAMPARRAY input_data ; r15 = JSAMPARRAY output_data - align 16 + align 32 global EXTN(jsimd_h2v2_downsample_sse2) EXTN(jsimd_h2v2_downsample_sse2): @@ -326,4 +326,4 @@ EXTN(jsimd_h2v2_downsample_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 32 diff --git a/simd/jdcolext-sse2-64.asm b/simd/jdcolext-sse2-64.asm index b48b2b7..f49709f 100644 --- a/simd/jdcolext-sse2-64.asm +++ b/simd/jdcolext-sse2-64.asm @@ -2,7 +2,7 @@ ; jdcolext.asm - colorspace conversion (64-bit SSE2) ; ; Copyright 2009, 2012 Pierre Ossman for Cendio AB -; Copyright (C) 2009, 2012, D. R. Commander. +; Copyright (C) 2009, 2012, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -37,7 +37,7 @@ %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_ycc_rgb_convert_sse2) EXTN(jsimd_ycc_rgb_convert_sse2): @@ -437,4 +437,4 @@ EXTN(jsimd_ycc_rgb_convert_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jdcolor-sse2-64.asm b/simd/jdcolor-sse2-64.asm index 855badb..6701ee6 100644 --- a/simd/jdcolor-sse2-64.asm +++ b/simd/jdcolor-sse2-64.asm @@ -2,7 +2,7 @@ ; jdcolor.asm - colorspace conversion (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -33,7 +33,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_ycc_rgb_convert_sse2) EXTN(jconst_ycc_rgb_convert_sse2): @@ -44,7 +44,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jdmerge-sse2-64.asm b/simd/jdmerge-sse2-64.asm index dddefd8..4b29639 100644 --- a/simd/jdmerge-sse2-64.asm +++ b/simd/jdmerge-sse2-64.asm @@ -2,7 +2,7 @@ ; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -33,7 +33,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_merged_upsample_sse2) EXTN(jconst_merged_upsample_sse2): @@ -44,7 +44,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jdmrgext-sse2-64.asm b/simd/jdmrgext-sse2-64.asm index 93a3ef3..4c2bb04 100644 --- a/simd/jdmrgext-sse2-64.asm +++ b/simd/jdmrgext-sse2-64.asm @@ -2,7 +2,7 @@ ; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2) ; ; Copyright 2009, 2012 Pierre Ossman for Cendio AB -; Copyright (C) 2009, 2012, D. R. Commander. +; Copyright (C) 2009, 2012, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -37,7 +37,7 @@ %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 3 - align 16 + align 32 global EXTN(jsimd_h2v1_merged_upsample_sse2) EXTN(jsimd_h2v1_merged_upsample_sse2): @@ -444,7 +444,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): ; r12d = JDIMENSION in_row_group_ctr ; r13 = JSAMPARRAY output_buf - align 16 + align 32 global EXTN(jsimd_h2v2_merged_upsample_sse2) EXTN(jsimd_h2v2_merged_upsample_sse2): @@ -534,4 +534,4 @@ EXTN(jsimd_h2v2_merged_upsample_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jdsample-sse2-64.asm b/simd/jdsample-sse2-64.asm index 11c3464..42a8e78 100644 --- a/simd/jdsample-sse2-64.asm +++ b/simd/jdsample-sse2-64.asm @@ -2,7 +2,7 @@ ; jdsample.asm - upsampling (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -21,7 +21,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_fancy_upsample_sse2) EXTN(jconst_fancy_upsample_sse2): @@ -32,7 +32,7 @@ PW_THREE times 8 dw 3 PW_SEVEN times 8 dw 7 PW_EIGHT times 8 dw 8 - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -57,7 +57,7 @@ PW_EIGHT times 8 dw 8 ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr - align 16 + align 32 global EXTN(jsimd_h2v1_fancy_upsample_sse2) EXTN(jsimd_h2v1_fancy_upsample_sse2): @@ -199,7 +199,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 4 - align 16 + align 32 global EXTN(jsimd_h2v2_fancy_upsample_sse2) EXTN(jsimd_h2v2_fancy_upsample_sse2): @@ -496,7 +496,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr - align 16 + align 32 global EXTN(jsimd_h2v1_upsample_sse2) EXTN(jsimd_h2v1_upsample_sse2): @@ -585,7 +585,7 @@ EXTN(jsimd_h2v1_upsample_sse2): ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr - align 16 + align 32 global EXTN(jsimd_h2v2_upsample_sse2) EXTN(jsimd_h2v2_upsample_sse2): @@ -667,4 +667,4 @@ EXTN(jsimd_h2v2_upsample_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jfdctflt-sse-64.asm b/simd/jfdctflt-sse-64.asm index cd48134..127ef28 100644 --- a/simd/jfdctflt-sse-64.asm +++ b/simd/jfdctflt-sse-64.asm @@ -2,7 +2,7 @@ ; jfdctflt.asm - floating-point FDCT (64-bit SSE) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -36,7 +36,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_fdct_float_sse) EXTN(jconst_fdct_float_sse): @@ -46,7 +46,7 @@ PD_0_707 times 4 dd 0.707106781186547524400844 PD_0_541 times 4 dd 0.541196100146196984399723 PD_1_306 times 4 dd 1.306562964876376527856643 - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -63,7 +63,7 @@ PD_1_306 times 4 dd 1.306562964876376527856643 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_fdct_float_sse) EXTN(jsimd_fdct_float_sse): @@ -354,4 +354,4 @@ EXTN(jsimd_fdct_float_sse): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jfdctfst-sse2-64.asm b/simd/jfdctfst-sse2-64.asm index 5fbc4d7..479e5da 100644 --- a/simd/jfdctfst-sse2-64.asm +++ b/simd/jfdctfst-sse2-64.asm @@ -2,7 +2,7 @@ ; jfdctfst.asm - fast integer FDCT (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -51,7 +51,7 @@ F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 + alignz 32 global EXTN(jconst_fdct_ifast_sse2) EXTN(jconst_fdct_ifast_sse2): @@ -61,7 +61,7 @@ PW_F0382 times 8 dw F_0_382 << CONST_SHIFT PW_F0541 times 8 dw F_0_541 << CONST_SHIFT PW_F1306 times 8 dw F_1_306 << CONST_SHIFT - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -78,7 +78,7 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_fdct_ifast_sse2) EXTN(jsimd_fdct_ifast_sse2): @@ -388,4 +388,4 @@ EXTN(jsimd_fdct_ifast_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jfdctint-sse2-64.asm b/simd/jfdctint-sse2-64.asm index 70148f1..cdaa56f 100644 --- a/simd/jfdctint-sse2-64.asm +++ b/simd/jfdctint-sse2-64.asm @@ -2,7 +2,7 @@ ; jfdctint.asm - accurate integer FDCT (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -65,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_fdct_islow_sse2) EXTN(jconst_fdct_islow_sse2): @@ -82,7 +82,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -99,7 +99,7 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 6 - align 16 + align 32 global EXTN(jsimd_fdct_islow_sse2) EXTN(jsimd_fdct_islow_sse2): @@ -618,4 +618,4 @@ EXTN(jsimd_fdct_islow_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctflt-sse2-64.asm b/simd/jidctflt-sse2-64.asm index 7e714d9..6f22339 100644 --- a/simd/jidctflt-sse2-64.asm +++ b/simd/jidctflt-sse2-64.asm @@ -2,7 +2,7 @@ ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -36,7 +36,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_idct_float_sse2) EXTN(jconst_idct_float_sse2): @@ -48,7 +48,7 @@ PD_M2_613 times 4 dd -2.613125929752753055713286 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -72,7 +72,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT ; FAST_FLOAT workspace[DCTSIZE2] - align 16 + align 32 global EXTN(jsimd_idct_float_sse2) EXTN(jsimd_idct_float_sse2): @@ -479,4 +479,4 @@ EXTN(jsimd_idct_float_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctfst-sse2-64.asm b/simd/jidctfst-sse2-64.asm index e7a1617..422ad1b 100644 --- a/simd/jidctfst-sse2-64.asm +++ b/simd/jidctfst-sse2-64.asm @@ -2,7 +2,7 @@ ; jidctfst.asm - fast integer IDCT (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -58,7 +58,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 + alignz 32 global EXTN(jconst_idct_ifast_sse2) EXTN(jconst_idct_ifast_sse2): @@ -69,7 +69,7 @@ PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT PW_F1082 times 8 dw F_1_082 << CONST_SHIFT PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -91,7 +91,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_idct_ifast_sse2) EXTN(jsimd_idct_ifast_sse2): @@ -488,4 +488,4 @@ EXTN(jsimd_idct_ifast_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctint-sse2-64.asm b/simd/jidctint-sse2-64.asm index a8cbce8..6487266 100644 --- a/simd/jidctint-sse2-64.asm +++ b/simd/jidctint-sse2-64.asm @@ -2,7 +2,7 @@ ; jidctint.asm - accurate integer IDCT (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -65,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_idct_islow_sse2) EXTN(jconst_idct_islow_sse2): @@ -82,7 +82,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -104,7 +104,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 12 - align 16 + align 32 global EXTN(jsimd_idct_islow_sse2) EXTN(jsimd_idct_islow_sse2): @@ -844,4 +844,4 @@ EXTN(jsimd_idct_islow_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctred-sse2-64.asm b/simd/jidctred-sse2-64.asm index dace694..69b71f0 100644 --- a/simd/jidctred-sse2-64.asm +++ b/simd/jidctred-sse2-64.asm @@ -2,7 +2,7 @@ ; jidctred.asm - reduced-size IDCT (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -71,7 +71,7 @@ F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_idct_red_sse2) EXTN(jconst_idct_red_sse2): @@ -89,7 +89,7 @@ PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -112,7 +112,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_idct_4x4_sse2) EXTN(jsimd_idct_4x4_sse2): @@ -411,7 +411,7 @@ EXTN(jsimd_idct_4x4_sse2): ; r12 = JSAMPARRAY output_buf ; r13d = JDIMENSION output_col - align 16 + align 32 global EXTN(jsimd_idct_2x2_sse2) EXTN(jsimd_idct_2x2_sse2): @@ -572,4 +572,4 @@ EXTN(jsimd_idct_2x2_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jquantf-sse2-64.asm b/simd/jquantf-sse2-64.asm index 241b857..3699d44 100644 --- a/simd/jquantf-sse2-64.asm +++ b/simd/jquantf-sse2-64.asm @@ -2,7 +2,7 @@ ; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -34,7 +34,7 @@ ; r11d = JDIMENSION start_col ; r12 = FAST_FLOAT *workspace - align 16 + align 32 global EXTN(jsimd_convsamp_float_sse2) EXTN(jsimd_convsamp_float_sse2): @@ -108,7 +108,7 @@ EXTN(jsimd_convsamp_float_sse2): ; r11 = FAST_FLOAT *divisors ; r12 = FAST_FLOAT *workspace - align 16 + align 32 global EXTN(jsimd_quantize_float_sse2) EXTN(jsimd_quantize_float_sse2): @@ -154,4 +154,4 @@ EXTN(jsimd_quantize_float_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jquanti-sse2-64.asm b/simd/jquanti-sse2-64.asm index 0f8f9ec..4b0873e 100644 --- a/simd/jquanti-sse2-64.asm +++ b/simd/jquanti-sse2-64.asm @@ -2,7 +2,7 @@ ; jquanti.asm - sample data conversion and quantization (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -34,7 +34,7 @@ ; r11d = JDIMENSION start_col ; r12 = DCTELEM *workspace - align 16 + align 32 global EXTN(jsimd_convsamp_sse2) EXTN(jsimd_convsamp_sse2): @@ -110,7 +110,7 @@ EXTN(jsimd_convsamp_sse2): ; r11 = DCTELEM *divisors ; r12 = DCTELEM *workspace - align 16 + align 32 global EXTN(jsimd_quantize_sse2) EXTN(jsimd_quantize_sse2): @@ -183,4 +183,4 @@ EXTN(jsimd_quantize_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
-        align   16
+        align   32
diff --git a/simd/jsimd.h b/simd/jsimd.h
index dc6ec43..a10995e 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -23,6 +23,7 @@
 #define JSIMD_ARM_NEON   0x10
 #define JSIMD_MIPS_DSPR2 0x20
 #define JSIMD_ALTIVEC    0x40
+#define JSIMD_AVX2       0x80
 
 /* SIMD Ext: retrieve SIMD/CPU information */
 EXTERN(unsigned int) jpeg_simd_cpu_support (void);
diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
index a62bcdb..b2fe5d3 100644
--- a/simd/jsimd_x86_64.c
+++ b/simd/jsimd_x86_64.c
@@ -46,9 +46,15 @@ init_simd (void)
   if (simd_support != ~0U)
     return;
 
-  simd_support = JSIMD_SSE2 | JSIMD_SSE;
+  simd_support = jpeg_simd_cpu_support();
 
   /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCESSE2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support &= JSIMD_SSE2;
+  env = getenv("JSIMD_FORCEAVX2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support &= JSIMD_AVX2;
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
diff --git a/simd/jsimdcfg.inc.h b/simd/jsimdcfg.inc.h
index 81574d5..7ff7e29 100644
--- a/simd/jsimdcfg.inc.h
+++ b/simd/jsimdcfg.inc.h
@@ -128,3 +128,4 @@
 %define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
 %define _cpp_protection_JSIMD_SSE JSIMD_SSE
 %define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
+%define _cpp_protection_JSIMD_AVX2 JSIMD_AVX2
diff --git a/simd/jsimdcpu-64.asm b/simd/jsimdcpu-64.asm
new file mode 100644
index 0000000..37ac0e8
--- /dev/null
+++ b/simd/jsimdcpu-64.asm
@@ -0,0 +1,86 @@
+;
+; jsimdcpu-64.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    64
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support (void)
+;
+; Returns a bit mask of JSIMD_* flags in rax.  JSIMD_SSE and JSIMD_SSE2 are
+; always set.  JSIMD_AVX2 is set only if CPUID leaf 07H exists and reports
+; AVX2, CPUID leaf 01H reports AVX and OSXSAVE, and XCR0 shows that the O/S
+; saves and restores both XMM and YMM state.
+;
+
+        align   32
+        global  EXTN(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+        push    rbx                     ; cpuid overwrites rbx
+        push    rdi
+
+        xor     rdi,rdi                 ; simd support flag
+        or      rdi, JSIMD_SSE2
+        or      rdi, JSIMD_SSE
+
+        ; Leaf 07H can be trusted only if the maximum basic leaf is >= 7
+        xor     eax,eax
+        cpuid
+        cmp     eax, 7
+        jb      short .return
+
+        ; Check for AVX2 instruction support
+        mov     eax, 7
+        xor     ecx,ecx
+        cpuid
+        test    ebx, 1<<5               ; bit5:AVX2 (extended feature flags)
+        jz      short .return
+
+        ; Check that the CPU supports AVX and the O/S has enabled XSAVE
+        mov     eax, 1
+        cpuid
+        and     ecx, (1<<27)|(1<<28)    ; bit27:OSXSAVE, bit28:AVX
+        cmp     ecx, (1<<27)|(1<<28)
+        jne     short .return
+
+        ; Check that the O/S saves/restores XMM and YMM state
+        xor     ecx,ecx                 ; select XCR0
+        xgetbv
+        and     eax, 6                  ; bit1:XMM state, bit2:YMM state
+        cmp     eax, 6
+        jne     short .return
+
+        or      rdi, JSIMD_AVX2
+
+.return:
+        mov     rax,rdi
+
+        pop     rdi
+        pop     rbx
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   32
diff --git a/simd/jsimdcpu.asm b/simd/jsimdcpu.asm
index 599083b..580810b 100644
--- a/simd/jsimdcpu.asm
+++ b/simd/jsimdcpu.asm
@@ -2,6 +2,7 @@
 ; jsimdcpu.asm - SIMD instruction support check
 ;
 ; Copyright 2009 Pierre Ossman for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -74,6 +75,32 @@ EXTN(jpeg_simd_cpu_support):
         or      edi, byte JSIMD_SSE2
 .no_sse2:
 
+        ; Check for AVX2 instruction support.  Leaf 07H can be trusted only
+        ; if the maximum basic CPUID leaf is >= 7.
+        xor     eax,eax
+        cpuid
+        cmp     eax, 7
+        jb      short .no_avx2
+        mov     eax, 7
+        xor     ecx,ecx
+        cpuid
+        test    ebx, 1<<5               ; bit5:AVX2
+        jz      short .no_avx2
+        ; Check that the CPU supports AVX and the O/S has enabled XSAVE
+        mov     eax, 1
+        cpuid
+        and     ecx, (1<<27)|(1<<28)    ; bit27:OSXSAVE, bit28:AVX
+        cmp     ecx, (1<<27)|(1<<28)
+        jne     short .no_avx2
+        ; Check that the O/S saves/restores XMM and YMM state
+        xor     ecx,ecx                 ; select XCR0
+        xgetbv
+        and     eax, 6                  ; bit1:XMM state, bit2:YMM state
+        cmp     eax, 6
+        jne     short .no_avx2
+        or      edi, JSIMD_AVX2
+.no_avx2:
+
         ; Check for 3DNow!
instruction support mov eax, 0x80000000 cpuid diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc index c138f01..48d0fb1 100644 --- a/simd/jsimdext.inc +++ b/simd/jsimdext.inc @@ -2,7 +2,7 @@ ; jsimdext.inc - common declarations ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2010, D. R. Commander. +; Copyright (C) 2010, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02 ; @@ -38,11 +38,11 @@ ; -- segment definition -- ; %ifdef __YASM_VER__ -%define SEG_TEXT .text align=16 -%define SEG_CONST .rdata align=16 +%define SEG_TEXT .text align=32 +%define SEG_CONST .rdata align=32 %else -%define SEG_TEXT .text align=16 public use32 class=CODE -%define SEG_CONST .rdata align=16 public use32 class=CONST +%define SEG_TEXT .text align=32 public use32 class=CODE +%define SEG_CONST .rdata align=32 public use32 class=CONST %endif %elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- @@ -51,11 +51,11 @@ ; -- segment definition -- ; %ifdef __YASM_VER__ -%define SEG_TEXT .text align=16 -%define SEG_CONST .rdata align=16 +%define SEG_TEXT .text align=32 +%define SEG_CONST .rdata align=32 %else -%define SEG_TEXT .text align=16 public use64 class=CODE -%define SEG_CONST .rdata align=16 public use64 class=CONST +%define SEG_TEXT .text align=32 public use64 class=CODE +%define SEG_CONST .rdata align=32 public use64 class=CONST %endif %define EXTN(name) name ; foo() -> foo @@ -78,11 +78,11 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; -- segment definition -- ; %ifdef __x86_64__ -%define SEG_TEXT .text progbits align=16 -%define SEG_CONST .rodata progbits align=16 +%define SEG_TEXT .text progbits align=32 +%define SEG_CONST .rodata progbits align=32 %else -%define SEG_TEXT .text progbits alloc exec nowrite align=16 -%define SEG_CONST .rodata progbits alloc noexec nowrite align=16 +%define SEG_TEXT .text progbits alloc exec nowrite align=32 +%define SEG_CONST .rodata progbits alloc noexec nowrite align=32 %endif ; 
To make the code position-independent, append -DPIC to the commandline @@ -108,8 +108,8 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; -- segment definition -- ; -%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why? -%define SEG_CONST .rodata align=16 +%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=16. why? +%define SEG_CONST .rodata align=32 ; The generation of position-independent code (PIC) is the default on Darwin. ; @@ -158,6 +158,10 @@ section .note.GNU-stack noalloc noexec nowrite progbits %define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD) %define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT +%define YMMWORD ; int256 (AVX register) +%define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD) +%define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT + ; Similar hacks for when we load a dword or MMWORD into an xmm# register %define XMM_DWORD %define XMM_MMWORD @@ -167,12 +171,14 @@ section .note.GNU-stack noalloc noexec nowrite progbits %define SIZEOF_DWORD 4 ; sizeof(DWORD) %define SIZEOF_QWORD 8 ; sizeof(QWORD) %define SIZEOF_OWORD 16 ; sizeof(OWORD) +%define SIZEOF_YWORD 32 ; sizeof(YWORD) %define BYTE_BIT 8 ; CHAR_BIT in C %define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT %define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT %define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT %define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT +%define YWORD_BIT 256 ; sizeof(YWORD)*BYTE_BIT ; -------------------------------------------------------------------------- ; External Symbol Name diff --git a/win/jsimdcfg.inc b/win/jsimdcfg.inc index 9d4aede..14eca46 100755 --- a/win/jsimdcfg.inc +++ b/win/jsimdcfg.inc @@ -90,5 +90,6 @@ %define JSIMD_3DNOW 0x02 %define JSIMD_SSE 0x04 %define JSIMD_SSE2 0x08 +%define JSIMD_AVX2 0x80 ; Short forms of external names for systems with brain-damaged linkers. ;