From: DRC Date: Tue, 5 Jul 2016 21:19:26 +0000 (-0500) Subject: Lay the groundwork for 32-bit AVX2 SIMD support X-Git-Tag: 1.5.90~127 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7ee3ce9ada2cace6ea54ed06df59122800d9d67c;p=libjpeg-turbo Lay the groundwork for 32-bit AVX2 SIMD support --- diff --git a/simd/jccolext-mmx.asm b/simd/jccolext-mmx.asm index 96a0372..4748b20 100644 --- a/simd/jccolext-mmx.asm +++ b/simd/jccolext-mmx.asm @@ -2,6 +2,7 @@ ; jccolext.asm - colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -38,7 +39,7 @@ %define WK_NUM 8 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 32 global EXTN(jsimd_rgb_ycc_convert_mmx) EXTN(jsimd_rgb_ycc_convert_mmx): @@ -473,4 +474,4 @@ EXTN(jsimd_rgb_ycc_convert_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jccolext-sse2.asm b/simd/jccolext-sse2.asm index d8496dc..611d804 100644 --- a/simd/jccolext-sse2.asm +++ b/simd/jccolext-sse2.asm @@ -1,7 +1,9 @@ ; ; jccolext.asm - colorspace conversion (SSE2) ; -; x86 SIMD extension for IJG JPEG library +; Copyright (C) 2016, D. R. Commander. +; +; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; @@ -36,7 +38,7 @@ %define WK_NUM 8 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 32 global EXTN(jsimd_rgb_ycc_convert_sse2) @@ -500,4 +502,4 @@ EXTN(jsimd_rgb_ycc_convert_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 32 diff --git a/simd/jccolor-mmx.asm b/simd/jccolor-mmx.asm index c4e6d88..aecdeed 100644 --- a/simd/jccolor-mmx.asm +++ b/simd/jccolor-mmx.asm @@ -2,7 +2,7 @@ ; jccolor.asm - colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -35,7 +35,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_rgb_ycc_convert_mmx) EXTN(jconst_rgb_ycc_convert_mmx): @@ -47,7 +47,7 @@ PW_MF008_MF041 times 2 dw -F_0_081,-F_0_418 PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jccolor-sse2.asm b/simd/jccolor-sse2.asm index aae51ba..f9b4c91 100644 --- a/simd/jccolor-sse2.asm +++ b/simd/jccolor-sse2.asm @@ -1,7 +1,7 @@ ; ; jccolor.asm - colorspace conversion (SSE2) ; -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -34,7 +34,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_rgb_ycc_convert_sse2) EXTN(jconst_rgb_ycc_convert_sse2): @@ -46,7 +46,7 @@ PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jcgray-mmx.asm b/simd/jcgray-mmx.asm index 0819b6c..48decc8 100644 --- a/simd/jcgray-mmx.asm +++ b/simd/jcgray-mmx.asm @@ -2,7 +2,7 @@ ; jcgray.asm - grayscale colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2011, D. R. Commander. +; Copyright (C) 2011, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -31,7 +31,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_rgb_gray_convert_mmx) EXTN(jconst_rgb_gray_convert_mmx): @@ -40,7 +40,7 @@ PW_F0299_F0337 times 2 dw F_0_299, F_0_337 PW_F0114_F0250 times 2 dw F_0_114, F_0_250 PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jcgray-sse2.asm b/simd/jcgray-sse2.asm index 831fda6..dedfc8a 100644 --- a/simd/jcgray-sse2.asm +++ b/simd/jcgray-sse2.asm @@ -1,7 +1,7 @@ ; ; jcgray.asm - grayscale colorspace conversion (SSE2) ; -; Copyright (C) 2011, D. R. Commander. +; Copyright (C) 2011, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -30,7 +30,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_rgb_gray_convert_sse2) EXTN(jconst_rgb_gray_convert_sse2): @@ -39,7 +39,7 @@ PW_F0299_F0337 times 4 dw F_0_299, F_0_337 PW_F0114_F0250 times 4 dw F_0_114, F_0_250 PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jcgryext-mmx.asm b/simd/jcgryext-mmx.asm index 1c1b8d8..841eb0b 100644 --- a/simd/jcgryext-mmx.asm +++ b/simd/jcgryext-mmx.asm @@ -2,7 +2,7 @@ ; jcgryext.asm - grayscale colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2011, D. R. Commander. +; Copyright (C) 2011, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -39,7 +39,7 @@ %define WK_NUM 2 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 32 global EXTN(jsimd_rgb_gray_convert_mmx) EXTN(jsimd_rgb_gray_convert_mmx): @@ -353,4 +353,4 @@ EXTN(jsimd_rgb_gray_convert_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jcgryext-sse2.asm b/simd/jcgryext-sse2.asm index 78beac7..40505a5 100644 --- a/simd/jcgryext-sse2.asm +++ b/simd/jcgryext-sse2.asm @@ -1,7 +1,7 @@ ; ; jcgryext.asm - grayscale colorspace conversion (SSE2) ; -; Copyright (C) 2011, D. R. Commander. +; Copyright (C) 2011, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -38,7 +38,7 @@ %define WK_NUM 2 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 32 global EXTN(jsimd_rgb_gray_convert_sse2) @@ -381,4 +381,4 @@ EXTN(jsimd_rgb_gray_convert_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jchuff-sse2.asm b/simd/jchuff-sse2.asm index cfae68c..c018c87 100644 --- a/simd/jchuff-sse2.asm +++ b/simd/jchuff-sse2.asm @@ -25,14 +25,14 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_huff_encode_one_block) EXTN(jconst_huff_encode_one_block): %include "jpeg_nbits_table.inc" - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -178,7 +178,7 @@ EXTN(jconst_huff_encode_one_block): %define put_buffer ebx %define put_bits edi - align 16 + align 32 global EXTN(jsimd_huff_encode_one_block_sse2) EXTN(jsimd_huff_encode_one_block_sse2): @@ -423,4 +423,4 @@ EXTN(jsimd_huff_encode_one_block_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jcsample-mmx.asm b/simd/jcsample-mmx.asm index 6cd544e..2fa6a1a 100644 --- a/simd/jcsample-mmx.asm +++ b/simd/jcsample-mmx.asm @@ -2,6 +2,7 @@ ; jcsample.asm - downsampling (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -38,7 +39,7 @@ %define input_data(b) (b)+24 ; JSAMPARRAY input_data %define output_data(b) (b)+28 ; JSAMPARRAY output_data - align 16 + align 32 global EXTN(jsimd_h2v1_downsample_mmx) EXTN(jsimd_h2v1_downsample_mmx): @@ -180,7 +181,7 @@ EXTN(jsimd_h2v1_downsample_mmx): %define input_data(b) (b)+24 ; JSAMPARRAY input_data %define output_data(b) (b)+28 ; JSAMPARRAY output_data - align 16 + align 32 global EXTN(jsimd_h2v2_downsample_mmx) EXTN(jsimd_h2v2_downsample_mmx): @@ -320,4 +321,4 @@ EXTN(jsimd_h2v2_downsample_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jcsample-sse2.asm b/simd/jcsample-sse2.asm index 5fc2637..3d34ab9 100644 --- a/simd/jcsample-sse2.asm +++ b/simd/jcsample-sse2.asm @@ -2,6 +2,7 @@ ; jcsample.asm - downsampling (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -38,7 +39,7 @@ %define input_data(b) (b)+24 ; JSAMPARRAY input_data %define output_data(b) (b)+28 ; JSAMPARRAY output_data - align 16 + align 32 global EXTN(jsimd_h2v1_downsample_sse2) EXTN(jsimd_h2v1_downsample_sse2): @@ -193,7 +194,7 @@ EXTN(jsimd_h2v1_downsample_sse2): %define input_data(b) (b)+24 ; JSAMPARRAY input_data %define output_data(b) (b)+28 ; JSAMPARRAY output_data - align 16 + align 32 global EXTN(jsimd_h2v2_downsample_sse2) EXTN(jsimd_h2v2_downsample_sse2): @@ -347,4 +348,4 @@ EXTN(jsimd_h2v2_downsample_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jdcolext-mmx.asm b/simd/jdcolext-mmx.asm index 21e34f6..1f0614e 100644 --- a/simd/jdcolext-mmx.asm +++ b/simd/jdcolext-mmx.asm @@ -2,6 +2,7 @@ ; jdcolext.asm - colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. 
; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -38,7 +39,7 @@ %define WK_NUM 2 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 32 global EXTN(jsimd_ycc_rgb_convert_mmx) EXTN(jsimd_ycc_rgb_convert_mmx): @@ -401,4 +402,4 @@ EXTN(jsimd_ycc_rgb_convert_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jdcolext-sse2.asm b/simd/jdcolext-sse2.asm index 09844bf..69462ef 100644 --- a/simd/jdcolext-sse2.asm +++ b/simd/jdcolext-sse2.asm @@ -2,7 +2,7 @@ ; jdcolext.asm - colorspace conversion (SSE2) ; ; Copyright 2009, 2012 Pierre Ossman for Cendio AB -; Copyright (C) 2012, D. R. Commander. +; Copyright (C) 2012, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -39,7 +39,7 @@ %define WK_NUM 2 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 32 global EXTN(jsimd_ycc_rgb_convert_sse2) EXTN(jsimd_ycc_rgb_convert_sse2): @@ -456,4 +456,4 @@ EXTN(jsimd_ycc_rgb_convert_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jdcolor-mmx.asm b/simd/jdcolor-mmx.asm index 4e58031..5136fe6 100644 --- a/simd/jdcolor-mmx.asm +++ b/simd/jdcolor-mmx.asm @@ -2,7 +2,7 @@ ; jdcolor.asm - colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -33,7 +33,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_ycc_rgb_convert_mmx) EXTN(jconst_ycc_rgb_convert_mmx): @@ -44,7 +44,7 @@ PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 PW_ONE times 4 dw 1 PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jdcolor-sse2.asm b/simd/jdcolor-sse2.asm index 1345df9..5552dca 100644 --- a/simd/jdcolor-sse2.asm +++ b/simd/jdcolor-sse2.asm @@ -2,7 +2,7 @@ ; jdcolor.asm - colorspace conversion (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -33,7 +33,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_ycc_rgb_convert_sse2) EXTN(jconst_ycc_rgb_convert_sse2): @@ -44,7 +44,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jdmerge-mmx.asm b/simd/jdmerge-mmx.asm index ee58bff..7ce1d86 100644 --- a/simd/jdmerge-mmx.asm +++ b/simd/jdmerge-mmx.asm @@ -2,7 +2,7 @@ ; jdmerge.asm - merged upsampling/color conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -33,7 +33,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_merged_upsample_mmx) EXTN(jconst_merged_upsample_mmx): @@ -44,7 +44,7 @@ PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 PW_ONE times 4 dw 1 PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jdmerge-sse2.asm b/simd/jdmerge-sse2.asm index 0683fd0..fc1d6ad 100644 --- a/simd/jdmerge-sse2.asm +++ b/simd/jdmerge-sse2.asm @@ -2,7 +2,7 @@ ; jdmerge.asm - merged upsampling/color conversion (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, D. R. Commander. +; Copyright (C) 2009, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -33,7 +33,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_merged_upsample_sse2) EXTN(jconst_merged_upsample_sse2): @@ -44,7 +44,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/simd/jdmrgext-mmx.asm b/simd/jdmrgext-mmx.asm index 63f45cf..0444632 100644 --- a/simd/jdmrgext-mmx.asm +++ b/simd/jdmrgext-mmx.asm @@ -2,6 +2,7 @@ ; jdmrgext.asm - merged upsampling/color conversion (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -38,7 +39,7 @@ %define WK_NUM 3 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 32 global EXTN(jsimd_h2v1_merged_upsample_mmx) EXTN(jsimd_h2v1_merged_upsample_mmx): @@ -407,7 +408,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx): %define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf - align 16 + align 32 global EXTN(jsimd_h2v2_merged_upsample_mmx) EXTN(jsimd_h2v2_merged_upsample_mmx): @@ -460,4 +461,4 @@ EXTN(jsimd_h2v2_merged_upsample_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jdmrgext-sse2.asm b/simd/jdmrgext-sse2.asm index 187ba0c..50702c6 100644 --- a/simd/jdmrgext-sse2.asm +++ b/simd/jdmrgext-sse2.asm @@ -2,7 +2,7 @@ ; jdmrgext.asm - merged upsampling/color conversion (SSE2) ; ; Copyright 2009, 2012 Pierre Ossman for Cendio AB -; Copyright (C) 2012, D. R. Commander. +; Copyright (C) 2012, 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -39,7 +39,7 @@ %define WK_NUM 3 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 32 global EXTN(jsimd_h2v1_merged_upsample_sse2) EXTN(jsimd_h2v1_merged_upsample_sse2): @@ -462,7 +462,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): %define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf - align 16 + align 32 global EXTN(jsimd_h2v2_merged_upsample_sse2) EXTN(jsimd_h2v2_merged_upsample_sse2): @@ -515,4 +515,4 @@ EXTN(jsimd_h2v2_merged_upsample_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 32 diff --git a/simd/jdsample-mmx.asm b/simd/jdsample-mmx.asm index 5e4fa7a..266ee7a 100644 --- a/simd/jdsample-mmx.asm +++ b/simd/jdsample-mmx.asm @@ -2,6 +2,7 @@ ; jdsample.asm - upsampling (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -20,7 +21,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_fancy_upsample_mmx) EXTN(jconst_fancy_upsample_mmx): @@ -31,7 +32,7 @@ PW_THREE times 4 dw 3 PW_SEVEN times 4 dw 7 PW_EIGHT times 4 dw 8 - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -56,7 +57,7 @@ PW_EIGHT times 4 dw 8 %define input_data(b) (b)+16 ; JSAMPARRAY input_data %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr - align 16 + align 32 global EXTN(jsimd_h2v1_fancy_upsample_mmx) EXTN(jsimd_h2v1_fancy_upsample_mmx): @@ -214,7 +215,7 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx): %define WK_NUM 4 %define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr - align 16 + align 32 global EXTN(jsimd_h2v2_fancy_upsample_mmx) EXTN(jsimd_h2v2_fancy_upsample_mmx): @@ -540,7 +541,7 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx): %define input_data(b) (b)+16 ; JSAMPARRAY input_data %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr - align 16 + align 32 global EXTN(jsimd_h2v1_upsample_mmx) EXTN(jsimd_h2v1_upsample_mmx): @@ -641,7 +642,7 @@ EXTN(jsimd_h2v1_upsample_mmx): %define input_data(b) (b)+16 ; JSAMPARRAY input_data %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr - align 16 + align 32 global EXTN(jsimd_h2v2_upsample_mmx) EXTN(jsimd_h2v2_upsample_mmx): @@ -733,4 +734,4 @@ EXTN(jsimd_h2v2_upsample_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 32 diff --git a/simd/jdsample-sse2.asm b/simd/jdsample-sse2.asm index 99a8c7c..01d96b7 100644 --- a/simd/jdsample-sse2.asm +++ b/simd/jdsample-sse2.asm @@ -2,6 +2,7 @@ ; jdsample.asm - upsampling (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -20,7 +21,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_fancy_upsample_sse2) EXTN(jconst_fancy_upsample_sse2): @@ -31,7 +32,7 @@ PW_THREE times 8 dw 3 PW_SEVEN times 8 dw 7 PW_EIGHT times 8 dw 8 - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -56,7 +57,7 @@ PW_EIGHT times 8 dw 8 %define input_data(b) (b)+16 ; JSAMPARRAY input_data %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr - align 16 + align 32 global EXTN(jsimd_h2v1_fancy_upsample_sse2) EXTN(jsimd_h2v1_fancy_upsample_sse2): @@ -212,7 +213,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): %define WK_NUM 4 %define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr - align 16 + align 32 global EXTN(jsimd_h2v2_fancy_upsample_sse2) EXTN(jsimd_h2v2_fancy_upsample_sse2): @@ -536,7 +537,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): %define input_data(b) (b)+16 ; JSAMPARRAY input_data %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr - align 16 + align 32 global EXTN(jsimd_h2v1_upsample_sse2) EXTN(jsimd_h2v1_upsample_sse2): @@ -635,7 +636,7 @@ EXTN(jsimd_h2v1_upsample_sse2): %define input_data(b) (b)+16 ; JSAMPARRAY input_data %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr - align 16 + align 32 global EXTN(jsimd_h2v2_upsample_sse2) EXTN(jsimd_h2v2_upsample_sse2): @@ -725,4 +726,4 @@ EXTN(jsimd_h2v2_upsample_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 32 diff --git a/simd/jfdctflt-3dn.asm b/simd/jfdctflt-3dn.asm index 2191618..c3d8fba 100644 --- a/simd/jfdctflt-3dn.asm +++ b/simd/jfdctflt-3dn.asm @@ -2,6 +2,7 @@ ; jfdctflt.asm - floating-point FDCT (3DNow!) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -25,7 +26,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_fdct_float_3dnow) EXTN(jconst_fdct_float_3dnow): @@ -35,7 +36,7 @@ PD_0_707 times 2 dd 0.707106781186547524400844 PD_0_541 times 2 dd 0.541196100146196984399723 PD_1_306 times 2 dd 1.306562964876376527856643 - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -53,7 +54,7 @@ PD_1_306 times 2 dd 1.306562964876376527856643 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_fdct_float_3dnow) EXTN(jsimd_fdct_float_3dnow): @@ -316,4 +317,4 @@ EXTN(jsimd_fdct_float_3dnow): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jfdctflt-sse.asm b/simd/jfdctflt-sse.asm index 8b1ce18..b15eb5c 100644 --- a/simd/jfdctflt-sse.asm +++ b/simd/jfdctflt-sse.asm @@ -2,6 +2,7 @@ ; jfdctflt.asm - floating-point FDCT (SSE) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -35,7 +36,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_fdct_float_sse) EXTN(jconst_fdct_float_sse): @@ -45,7 +46,7 @@ PD_0_707 times 4 dd 0.707106781186547524400844 PD_0_541 times 4 dd 0.541196100146196984399723 PD_1_306 times 4 dd 1.306562964876376527856643 - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -63,7 +64,7 @@ PD_1_306 times 4 dd 1.306562964876376527856643 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_fdct_float_sse) EXTN(jsimd_fdct_float_sse): @@ -366,4 +367,4 @@ EXTN(jsimd_fdct_float_sse): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jfdctfst-mmx.asm b/simd/jfdctfst-mmx.asm index eb2eb9c..f274924 100644 --- a/simd/jfdctfst-mmx.asm +++ b/simd/jfdctfst-mmx.asm @@ -2,6 +2,7 @@ ; jfdctfst.asm - fast integer FDCT (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -50,7 +51,7 @@ F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 + alignz 32 global EXTN(jconst_fdct_ifast_mmx) EXTN(jconst_fdct_ifast_mmx): @@ -60,7 +61,7 @@ PW_F0382 times 4 dw F_0_382 << CONST_SHIFT PW_F0541 times 4 dw F_0_541 << CONST_SHIFT PW_F1306 times 4 dw F_1_306 << CONST_SHIFT - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -78,7 +79,7 @@ PW_F1306 times 4 dw F_1_306 << CONST_SHIFT %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_fdct_ifast_mmx) EXTN(jsimd_fdct_ifast_mmx): @@ -393,4 +394,4 @@ EXTN(jsimd_fdct_ifast_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jfdctfst-sse2.asm b/simd/jfdctfst-sse2.asm index bd7723c..fb140fa 100644 --- a/simd/jfdctfst-sse2.asm +++ b/simd/jfdctfst-sse2.asm @@ -2,6 +2,7 @@ ; jfdctfst.asm - fast integer FDCT (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -50,7 +51,7 @@ F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 + alignz 32 global EXTN(jconst_fdct_ifast_sse2) EXTN(jconst_fdct_ifast_sse2): @@ -60,7 +61,7 @@ PW_F0382 times 8 dw F_0_382 << CONST_SHIFT PW_F0541 times 8 dw F_0_541 << CONST_SHIFT PW_F1306 times 8 dw F_1_306 << CONST_SHIFT - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -78,7 +79,7 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_fdct_ifast_sse2) EXTN(jsimd_fdct_ifast_sse2): @@ -400,4 +401,4 @@ EXTN(jsimd_fdct_ifast_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jfdctint-mmx.asm b/simd/jfdctint-mmx.asm index 9142ad8..c035b0c 100644 --- a/simd/jfdctint-mmx.asm +++ b/simd/jfdctint-mmx.asm @@ -2,6 +2,7 @@ ; jfdctint.asm - accurate integer FDCT (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -64,7 +65,7 @@ F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_fdct_islow_mmx) EXTN(jconst_fdct_islow_mmx): @@ -81,7 +82,7 @@ PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -99,7 +100,7 @@ PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1) %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_fdct_islow_mmx) EXTN(jsimd_fdct_islow_mmx): @@ -618,4 +619,4 @@ EXTN(jsimd_fdct_islow_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jfdctint-sse2.asm b/simd/jfdctint-sse2.asm index ecfff30..66f2cc6 100644 --- a/simd/jfdctint-sse2.asm +++ b/simd/jfdctint-sse2.asm @@ -2,6 +2,7 @@ ; jfdctint.asm - accurate integer FDCT (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -64,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_fdct_islow_sse2) EXTN(jconst_fdct_islow_sse2): @@ -81,7 +82,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -99,7 +100,7 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 6 - align 16 + align 32 global EXTN(jsimd_fdct_islow_sse2) EXTN(jsimd_fdct_islow_sse2): @@ -630,4 +631,4 @@ EXTN(jsimd_fdct_islow_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctflt-3dn.asm b/simd/jidctflt-3dn.asm index 99356f2..1e47cf1 100644 --- a/simd/jidctflt-3dn.asm +++ b/simd/jidctflt-3dn.asm @@ -2,6 +2,7 @@ ; jidctflt.asm - floating-point IDCT (3DNow! & MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -25,7 +26,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_idct_float_3dnow) EXTN(jconst_idct_float_3dnow): @@ -37,7 +38,7 @@ PD_2_613 times 2 dd 2.613125929752753055713286 PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -61,7 +62,7 @@ PB_CENTERJSAMP times 8 db CENTERJSAMPLE %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT ; FAST_FLOAT workspace[DCTSIZE2] - align 16 + align 32 global EXTN(jsimd_idct_float_3dnow) EXTN(jsimd_idct_float_3dnow): @@ -448,4 +449,4 @@ EXTN(jsimd_idct_float_3dnow): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctflt-sse.asm b/simd/jidctflt-sse.asm index 4d4af2f..a1c99e9 100644 --- a/simd/jidctflt-sse.asm +++ b/simd/jidctflt-sse.asm @@ -2,6 +2,7 @@ ; jidctflt.asm - floating-point IDCT (SSE & MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -35,7 +36,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_idct_float_sse) EXTN(jconst_idct_float_sse): @@ -47,7 +48,7 @@ PD_M2_613 times 4 dd -2.613125929752753055713286 PD_0_125 times 4 dd 0.125 ; 1/8 PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -71,7 +72,7 @@ PB_CENTERJSAMP times 8 db CENTERJSAMPLE %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT ; FAST_FLOAT workspace[DCTSIZE2] - align 16 + align 32 global EXTN(jsimd_idct_float_sse) EXTN(jsimd_idct_float_sse): @@ -568,4 +569,4 @@ EXTN(jsimd_idct_float_sse): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctflt-sse2.asm b/simd/jidctflt-sse2.asm index e34d297..e8be844 100644 --- a/simd/jidctflt-sse2.asm +++ b/simd/jidctflt-sse2.asm @@ -2,6 +2,7 @@ ; jidctflt.asm - floating-point IDCT (SSE & SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -35,7 +36,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_idct_float_sse2) EXTN(jconst_idct_float_sse2): @@ -47,7 +48,7 @@ PD_M2_613 times 4 dd -2.613125929752753055713286 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -71,7 +72,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT ; FAST_FLOAT workspace[DCTSIZE2] - align 16 + align 32 global EXTN(jsimd_idct_float_sse2) EXTN(jsimd_idct_float_sse2): @@ -494,4 +495,4 @@ EXTN(jsimd_idct_float_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctfst-mmx.asm b/simd/jidctfst-mmx.asm index 6e95bfb..7b5c2f2 100644 --- a/simd/jidctfst-mmx.asm +++ b/simd/jidctfst-mmx.asm @@ -2,6 +2,7 @@ ; jidctfst.asm - fast integer IDCT (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -57,7 +58,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 + alignz 32 global EXTN(jconst_idct_ifast_mmx) EXTN(jconst_idct_ifast_mmx): @@ -68,7 +69,7 @@ PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT PW_F1082 times 4 dw F_1_082 << CONST_SHIFT PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -92,7 +93,7 @@ PB_CENTERJSAMP times 8 db CENTERJSAMPLE %define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF ; JCOEF workspace[DCTSIZE2] - align 16 + align 32 global EXTN(jsimd_idct_ifast_mmx) EXTN(jsimd_idct_ifast_mmx): @@ -496,4 +497,4 @@ EXTN(jsimd_idct_ifast_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctfst-sse2.asm b/simd/jidctfst-sse2.asm index c5e5f33..51e3317 100644 --- a/simd/jidctfst-sse2.asm +++ b/simd/jidctfst-sse2.asm @@ -2,6 +2,7 @@ ; jidctfst.asm - fast integer IDCT (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -57,7 +58,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 + alignz 32 global EXTN(jconst_idct_ifast_sse2) EXTN(jconst_idct_ifast_sse2): @@ -68,7 +69,7 @@ PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT PW_F1082 times 8 dw F_1_082 << CONST_SHIFT PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -90,7 +91,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_idct_ifast_sse2) EXTN(jsimd_idct_ifast_sse2): @@ -498,4 +499,4 @@ EXTN(jsimd_idct_ifast_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctint-mmx.asm b/simd/jidctint-mmx.asm index 5bd1981..6ef7ff7 100644 --- a/simd/jidctint-mmx.asm +++ b/simd/jidctint-mmx.asm @@ -2,6 +2,7 @@ ; jidctint.asm - accurate integer IDCT (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -64,7 +65,7 @@ F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_idct_islow_mmx) EXTN(jconst_idct_islow_mmx): @@ -81,7 +82,7 @@ PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -105,7 +106,7 @@ PB_CENTERJSAMP times 8 db CENTERJSAMPLE %define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF ; JCOEF workspace[DCTSIZE2] - align 16 + align 32 global EXTN(jsimd_idct_islow_mmx) EXTN(jsimd_idct_islow_mmx): @@ -848,4 +849,4 @@ EXTN(jsimd_idct_islow_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctint-sse2.asm b/simd/jidctint-sse2.asm index 03ef3d9..441da70 100644 --- a/simd/jidctint-sse2.asm +++ b/simd/jidctint-sse2.asm @@ -2,6 +2,7 @@ ; jidctint.asm - accurate integer IDCT (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -64,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_idct_islow_sse2) EXTN(jconst_idct_islow_sse2): @@ -81,7 +82,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -103,7 +104,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 12 - align 16 + align 32 global EXTN(jsimd_idct_islow_sse2) EXTN(jsimd_idct_islow_sse2): @@ -855,4 +856,4 @@ EXTN(jsimd_idct_islow_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctred-mmx.asm b/simd/jidctred-mmx.asm index ba054e3..d5b7e2e 100644 --- a/simd/jidctred-mmx.asm +++ b/simd/jidctred-mmx.asm @@ -2,6 +2,7 @@ ; jidctred.asm - reduced-size IDCT (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -70,7 +71,7 @@ F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_idct_red_mmx) EXTN(jconst_idct_red_mmx): @@ -88,7 +89,7 @@ PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1) PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1) PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -113,7 +114,7 @@ PB_CENTERJSAMP times 8 db CENTERJSAMPLE %define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF ; JCOEF workspace[DCTSIZE2] - align 16 + align 32 global EXTN(jsimd_idct_4x4_mmx) EXTN(jsimd_idct_4x4_mmx): @@ -501,7 +502,7 @@ EXTN(jsimd_idct_4x4_mmx): %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf %define output_col(b) (b)+20 ; JDIMENSION output_col - align 16 + align 32 global EXTN(jsimd_idct_2x2_mmx) EXTN(jsimd_idct_2x2_mmx): @@ -702,4 +703,4 @@ EXTN(jsimd_idct_2x2_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jidctred-sse2.asm b/simd/jidctred-sse2.asm index 761fba8..d0609ac 100644 --- a/simd/jidctred-sse2.asm +++ b/simd/jidctred-sse2.asm @@ -2,6 +2,7 @@ ; jidctred.asm - reduced-size IDCT (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -70,7 +71,7 @@ F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 16 + alignz 32 global EXTN(jconst_idct_red_sse2) EXTN(jconst_idct_red_sse2): @@ -88,7 +89,7 @@ PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -111,7 +112,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 - align 16 + align 32 global EXTN(jsimd_idct_4x4_sse2) EXTN(jsimd_idct_4x4_sse2): @@ -422,7 +423,7 @@ EXTN(jsimd_idct_4x4_sse2): %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf %define output_col(b) (b)+20 ; JDIMENSION output_col - align 16 + align 32 global EXTN(jsimd_idct_2x2_sse2) EXTN(jsimd_idct_2x2_sse2): @@ -590,4 +591,4 @@ EXTN(jsimd_idct_2x2_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jquant-3dn.asm b/simd/jquant-3dn.asm index 0b4164b..73c02cd 100644 --- a/simd/jquant-3dn.asm +++ b/simd/jquant-3dn.asm @@ -2,6 +2,7 @@ ; jquant.asm - sample data conversion and quantization (3DNow! & MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -33,7 +34,7 @@ %define start_col ebp+12 ; JDIMENSION start_col %define workspace ebp+16 ; FAST_FLOAT *workspace - align 16 + align 32 global EXTN(jsimd_convsamp_float_3dnow) EXTN(jsimd_convsamp_float_3dnow): @@ -136,7 +137,7 @@ EXTN(jsimd_convsamp_float_3dnow): %define divisors ebp+12 ; FAST_FLOAT *divisors %define workspace ebp+16 ; FAST_FLOAT *workspace - align 16 + align 32 global EXTN(jsimd_quantize_float_3dnow) EXTN(jsimd_quantize_float_3dnow): @@ -229,4 +230,4 @@ EXTN(jsimd_quantize_float_3dnow): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jquant-mmx.asm b/simd/jquant-mmx.asm index aed6071..93dea31 100644 --- a/simd/jquant-mmx.asm +++ b/simd/jquant-mmx.asm @@ -2,6 +2,7 @@ ; jquant.asm - sample data conversion and quantization (MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -33,7 +34,7 @@ %define start_col ebp+12 ; JDIMENSION start_col %define workspace ebp+16 ; DCTELEM *workspace - align 16 + align 32 global EXTN(jsimd_convsamp_mmx) EXTN(jsimd_convsamp_mmx): @@ -138,7 +139,7 @@ EXTN(jsimd_convsamp_mmx): %define divisors ebp+12 ; DCTELEM *divisors %define workspace ebp+16 ; DCTELEM *workspace - align 16 + align 32 global EXTN(jsimd_quantize_mmx) EXTN(jsimd_quantize_mmx): @@ -270,4 +271,4 @@ EXTN(jsimd_quantize_mmx): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jquant-sse.asm b/simd/jquant-sse.asm index 1baf88f..d6ab943 100644 --- a/simd/jquant-sse.asm +++ b/simd/jquant-sse.asm @@ -2,6 +2,7 @@ ; jquant.asm - sample data conversion and quantization (SSE & MMX) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. 
; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -33,7 +34,7 @@ %define start_col ebp+12 ; JDIMENSION start_col %define workspace ebp+16 ; FAST_FLOAT *workspace - align 16 + align 32 global EXTN(jsimd_convsamp_float_sse) EXTN(jsimd_convsamp_float_sse): @@ -136,7 +137,7 @@ EXTN(jsimd_convsamp_float_sse): %define divisors ebp+12 ; FAST_FLOAT *divisors %define workspace ebp+16 ; FAST_FLOAT *workspace - align 16 + align 32 global EXTN(jsimd_quantize_float_sse) EXTN(jsimd_quantize_float_sse): @@ -207,4 +208,4 @@ EXTN(jsimd_quantize_float_sse): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jquantf-sse2.asm b/simd/jquantf-sse2.asm index 1dca26a..3c0000e 100644 --- a/simd/jquantf-sse2.asm +++ b/simd/jquantf-sse2.asm @@ -2,6 +2,7 @@ ; jquantf.asm - sample data conversion and quantization (SSE & SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -33,7 +34,7 @@ %define start_col ebp+12 ; JDIMENSION start_col %define workspace ebp+16 ; FAST_FLOAT *workspace - align 16 + align 32 global EXTN(jsimd_convsamp_float_sse2) EXTN(jsimd_convsamp_float_sse2): @@ -113,7 +114,7 @@ EXTN(jsimd_convsamp_float_sse2): %define divisors ebp+12 ; FAST_FLOAT *divisors %define workspace ebp+16 ; FAST_FLOAT *workspace - align 16 + align 32 global EXTN(jsimd_quantize_float_sse2) EXTN(jsimd_quantize_float_sse2): @@ -167,4 +168,4 @@ EXTN(jsimd_quantize_float_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 32 diff --git a/simd/jquanti-sse2.asm b/simd/jquanti-sse2.asm index 6fb001f..6f42947 100644 --- a/simd/jquanti-sse2.asm +++ b/simd/jquanti-sse2.asm @@ -2,6 +2,7 @@ ; jquanti.asm - sample data conversion and quantization (SSE2) ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2016, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -33,7 +34,7 @@ %define start_col ebp+12 ; JDIMENSION start_col %define workspace ebp+16 ; DCTELEM *workspace - align 16 + align 32 global EXTN(jsimd_convsamp_sse2) EXTN(jsimd_convsamp_sse2): @@ -115,7 +116,7 @@ EXTN(jsimd_convsamp_sse2): %define divisors ebp+12 ; DCTELEM *divisors %define workspace ebp+16 ; DCTELEM *workspace - align 16 + align 32 global EXTN(jsimd_quantize_sse2) EXTN(jsimd_quantize_sse2): @@ -196,4 +197,4 @@ EXTN(jsimd_quantize_sse2): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 32 diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c index 6da8bd8..2e770c1 100644 --- a/simd/jsimd_i386.c +++ b/simd/jsimd_i386.c @@ -61,6 +61,9 @@ init_simd (void) env = getenv("JSIMD_FORCESSE2"); if ((env != NULL) && (strcmp(env, "1") == 0)) simd_support &= JSIMD_SSE2; + env = getenv("JSIMD_FORCEAVX2"); + if ((env != NULL) && (strcmp(env, "1") == 0)) + simd_support &= JSIMD_AVX2; env = getenv("JSIMD_FORCENONE"); if ((env != NULL) && (strcmp(env, "1") == 0)) simd_support = 0; diff --git a/simd/jsimdcpu.asm b/simd/jsimdcpu.asm index 580810b..f42206e 100644 --- a/simd/jsimdcpu.asm +++ b/simd/jsimdcpu.asm @@ -28,7 +28,7 @@ ; jpeg_simd_cpu_support (void) ; - align 16 + align 32 global EXTN(jpeg_simd_cpu_support) EXTN(jpeg_simd_cpu_support): @@ -112,4 +112,4 @@ EXTN(jpeg_simd_cpu_support): ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 32 diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc index d16bdef..3485d6d 100644 --- a/simd/jsimdext.inc +++ b/simd/jsimdext.inc @@ -64,8 +64,8 @@ ; -- segment definition -- ; -%define SEG_TEXT _text align=16 public use32 class=CODE -%define SEG_CONST _data align=16 public use32 class=DATA +%define SEG_TEXT _text align=32 public use32 class=CODE +%define SEG_CONST _data align=32 public use32 class=DATA %elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ ; * Linux @@ -108,7 +108,7 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; -- segment definition -- ; -%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=16. why? +%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why? %define SEG_CONST .rodata align=32 ; The generation of position-independent code (PIC) is the default on Darwin. diff --git a/turbojpeg.c b/turbojpeg.c index 421b5f8..95bf5dc 100644 --- a/turbojpeg.c +++ b/turbojpeg.c @@ -926,29 +926,29 @@ DLLEXPORT int DLLCALL tjEncodeYUVPlanes(tjhandle handle, compptr=&cinfo->comp_info[i]; _tmpbuf[i]=(JSAMPLE *)malloc( PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE) - /compptr->h_samp_factor, 16) * cinfo->max_v_samp_factor + 16); + /compptr->h_samp_factor, 32) * cinfo->max_v_samp_factor + 32); if(!_tmpbuf[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure"); tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*cinfo->max_v_samp_factor); if(!tmpbuf[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure"); for(row=0; row<cinfo->max_v_samp_factor; row++) { unsigned char *_tmpbuf_aligned= - (unsigned char *)PAD((size_t)_tmpbuf[i], 16); + (unsigned char *)PAD((size_t)_tmpbuf[i], 32); tmpbuf[i][row]=&_tmpbuf_aligned[ PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE) - /compptr->h_samp_factor, 16) * row]; + /compptr->h_samp_factor, 32) * row]; } - _tmpbuf2[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16) - * compptr->v_samp_factor + 16); + _tmpbuf2[i]=(JSAMPLE
*)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 32) + * compptr->v_samp_factor + 32); if(!_tmpbuf2[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure"); tmpbuf2[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor); if(!tmpbuf2[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure"); for(row=0; row<compptr->v_samp_factor; row++) { unsigned char *_tmpbuf2_aligned= - (unsigned char *)PAD((size_t)_tmpbuf2[i], 16); + (unsigned char *)PAD((size_t)_tmpbuf2[i], 32); tmpbuf2[i][row]=&_tmpbuf2_aligned[ - PAD(compptr->width_in_blocks*DCTSIZE, 16) * row]; + PAD(compptr->width_in_blocks*DCTSIZE, 32) * row]; } pw[i]=pw0*compptr->h_samp_factor/cinfo->max_h_samp_factor; ph[i]=ph0*compptr->v_samp_factor/cinfo->max_v_samp_factor; @@ -1611,17 +1611,17 @@ DLLEXPORT int DLLCALL tjDecodeYUVPlanes(tjhandle handle, for(i=0; i<dinfo->num_components; i++) { compptr=&dinfo->comp_info[i]; - _tmpbuf[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16) - * compptr->v_samp_factor + 16); + _tmpbuf[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 32) + * compptr->v_samp_factor + 32); if(!_tmpbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure"); tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor); if(!tmpbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure"); for(row=0; row<compptr->v_samp_factor; row++) { unsigned char *_tmpbuf_aligned= - (unsigned char *)PAD((size_t)_tmpbuf[i], 16); + (unsigned char *)PAD((size_t)_tmpbuf[i], 32); tmpbuf[i][row]=&_tmpbuf_aligned[ - PAD(compptr->width_in_blocks*DCTSIZE, 16) * row]; + PAD(compptr->width_in_blocks*DCTSIZE, 32) * row]; } pw[i]=pw0*compptr->h_samp_factor/dinfo->max_h_samp_factor; ph[i]=ph0*compptr->v_samp_factor/dinfo->max_v_samp_factor;