#ifndef WITH_SIMD
#define ALIGN_SIZE sizeof(double)
#else
-#define ALIGN_SIZE 16 /* Most SIMD implementations require this */
+#define ALIGN_SIZE 32 /* Most of the SIMD instructions we support require
+ 16-byte (128-bit) alignment, but AVX2 requires
+ 32-byte alignment. */
#endif
#endif
endif()
if(SIMD_X86_64)
- set(SIMD_BASENAMES jfdctflt-sse-64 jccolor-sse2-64 jcgray-sse2-64
+ set(SIMD_BASENAMES jsimdcpu-64 jfdctflt-sse-64 jccolor-sse2-64 jcgray-sse2-64
jchuff-sse2-64 jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64
jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
if SIMD_X86_64
libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
- jcolsamp.inc jdct.inc jpeg_nbits_table.inc jfdctflt-sse-64.asm \
+ jcolsamp.inc jdct.inc jpeg_nbits_table.inc jsimdcpu-64.asm \
+ jfdctflt-sse-64.asm \
jccolor-sse2-64.asm jcgray-sse2-64.asm jchuff-sse2-64.asm \
jcsample-sse2-64.asm jdcolor-sse2-64.asm jdmerge-sse2-64.asm \
jdsample-sse2-64.asm jfdctfst-sse2-64.asm jfdctint-sse2-64.asm \
;
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8
- align 16
+ align 32
global EXTN(jsimd_rgb_ycc_convert_sse2)
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
;
; jccolor.asm - colorspace conversion (64-bit SSE2)
;
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2):
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
;
; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
;
-; Copyright (C) 2011, D. R. Commander.
+; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2):
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
-; Copyright (C) 2011, D. R. Commander.
+; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_rgb_gray_convert_sse2)
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block):
%include "jpeg_nbits_table.inc"
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define put_bits r9d
%define buffer rax
- align 16
+ align 32
global EXTN(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jcsample.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
- align 16
+ align 32
global EXTN(jsimd_h2v1_downsample_sse2)
EXTN(jsimd_h2v1_downsample_sse2):
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
- align 16
+ align 32
global EXTN(jsimd_h2v2_downsample_sse2)
EXTN(jsimd_h2v2_downsample_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jdcolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, D. R. Commander.
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_ycc_rgb_convert_sse2)
EXTN(jsimd_ycc_rgb_convert_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jdcolor.asm - colorspace conversion (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2):
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2):
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, D. R. Commander.
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 3
- align 16
+ align 32
global EXTN(jsimd_h2v1_merged_upsample_sse2)
EXTN(jsimd_h2v1_merged_upsample_sse2):
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
- align 16
+ align 32
global EXTN(jsimd_h2v2_merged_upsample_sse2)
EXTN(jsimd_h2v2_merged_upsample_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jdsample.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_fancy_upsample_sse2)
EXTN(jconst_fancy_upsample_sse2):
PW_SEVEN times 8 dw 7
PW_EIGHT times 8 dw 8
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
- align 16
+ align 32
global EXTN(jsimd_h2v1_fancy_upsample_sse2)
EXTN(jsimd_h2v1_fancy_upsample_sse2):
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 4
- align 16
+ align 32
global EXTN(jsimd_h2v2_fancy_upsample_sse2)
EXTN(jsimd_h2v2_fancy_upsample_sse2):
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
- align 16
+ align 32
global EXTN(jsimd_h2v1_upsample_sse2)
EXTN(jsimd_h2v1_upsample_sse2):
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
- align 16
+ align 32
global EXTN(jsimd_h2v2_upsample_sse2)
EXTN(jsimd_h2v2_upsample_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jfdctflt.asm - floating-point FDCT (64-bit SSE)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse):
PD_0_541 times 4 dd 0.541196100146196984399723
PD_1_306 times 4 dd 1.306562964876376527856643
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_fdct_float_sse)
EXTN(jsimd_fdct_float_sse):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
- alignz 16
+ alignz 32
global EXTN(jconst_fdct_ifast_sse2)
EXTN(jconst_fdct_ifast_sse2):
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_fdct_ifast_sse2)
EXTN(jsimd_fdct_ifast_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_fdct_islow_sse2)
EXTN(jconst_fdct_islow_sse2):
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 6
- align 16
+ align 32
global EXTN(jsimd_fdct_islow_sse2)
EXTN(jsimd_fdct_islow_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_idct_float_sse2)
EXTN(jconst_idct_float_sse2):
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
; FAST_FLOAT workspace[DCTSIZE2]
- align 16
+ align 32
global EXTN(jsimd_idct_float_sse2)
EXTN(jsimd_idct_float_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctfst.asm - fast integer IDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
- alignz 16
+ alignz 32
global EXTN(jconst_idct_ifast_sse2)
EXTN(jconst_idct_ifast_sse2):
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_idct_ifast_sse2)
EXTN(jsimd_idct_ifast_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctint.asm - accurate integer IDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_idct_islow_sse2)
EXTN(jconst_idct_islow_sse2):
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 12
- align 16
+ align 32
global EXTN(jsimd_idct_islow_sse2)
EXTN(jsimd_idct_islow_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctred.asm - reduced-size IDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_idct_red_sse2)
EXTN(jconst_idct_red_sse2):
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_idct_4x4_sse2)
EXTN(jsimd_idct_4x4_sse2):
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col
- align 16
+ align 32
global EXTN(jsimd_idct_2x2_sse2)
EXTN(jsimd_idct_2x2_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; r11d = JDIMENSION start_col
; r12 = FAST_FLOAT *workspace
- align 16
+ align 32
global EXTN(jsimd_convsamp_float_sse2)
EXTN(jsimd_convsamp_float_sse2):
; r11 = FAST_FLOAT *divisors
; r12 = FAST_FLOAT *workspace
- align 16
+ align 32
global EXTN(jsimd_quantize_float_sse2)
EXTN(jsimd_quantize_float_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; r11d = JDIMENSION start_col
; r12 = DCTELEM *workspace
- align 16
+ align 32
global EXTN(jsimd_convsamp_sse2)
EXTN(jsimd_convsamp_sse2):
; r11 = DCTELEM *divisors
; r12 = DCTELEM *workspace
- align 16
+ align 32
global EXTN(jsimd_quantize_sse2)
EXTN(jsimd_quantize_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
#define JSIMD_ARM_NEON 0x10
#define JSIMD_MIPS_DSPR2 0x20
#define JSIMD_ALTIVEC 0x40
+#define JSIMD_AVX2 0x80
/* SIMD Ext: retrieve SIMD/CPU information */
EXTERN(unsigned int) jpeg_simd_cpu_support (void);
if (simd_support != ~0U)
return;
- simd_support = JSIMD_SSE2 | JSIMD_SSE;
+ simd_support = jpeg_simd_cpu_support();
/* Force different settings through environment variables */
+ env = getenv("JSIMD_FORCESSE2");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support &= JSIMD_SSE2;
+ env = getenv("JSIMD_FORCEAVX2");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support &= JSIMD_AVX2;
env = getenv("JSIMD_FORCENONE");
if ((env != NULL) && (strcmp(env, "1") == 0))
simd_support = 0;
%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
%define _cpp_protection_JSIMD_SSE JSIMD_SSE
%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
+%define _cpp_protection_JSIMD_AVX2 JSIMD_AVX2
--- /dev/null
+;
+; jsimdcpu-64.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support (void)
+;
+
+ align 32
+ global EXTN(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+; Returns (in eax) a bitmask of JSIMD_* flags describing the SIMD instruction
+; sets that may be used.  JSIMD_SSE and JSIMD_SSE2 are always reported,
+; because both are part of the baseline x86-64 architecture.  JSIMD_AVX2 is
+; reported only if all of the following hold:
+;   1. CPUID leaf 7 exists (maximum basic leaf >= 7),
+;   2. CPUID.(EAX=7,ECX=0):EBX bit 5 (AVX2) is set,
+;   3. CPUID.1:ECX bits 27 (OSXSAVE) and 28 (AVX) are set,
+;   4. XCR0 bits 1 and 2 are set, i.e. the O/S saves/restores XMM and YMM
+;      state on context switches.
+;
+; rbx is saved because CPUID overwrites it; rdi is saved because it is used
+; as the accumulator and is callee-saved under the Win64 calling convention.
+; Clobbers: rax, rcx, rdx, flags.
+	push	rbx
+	push	rdi
+
+	mov	edi, (JSIMD_SSE2 | JSIMD_SSE)	; simd support flag
+
+	; Make sure that CPUID leaf 7 is implemented before querying it.
+	; Querying an unsupported leaf can return unrelated data in ebx.
+	xor	eax, eax
+	cpuid				; eax = maximum basic CPUID leaf
+	cmp	eax, 7
+	jb	short .return
+
+	; Check for AVX2 instruction support
+	mov	eax, 7
+	xor	ecx, ecx		; sub-leaf 0
+	cpuid				; ebx = Extended feature flags
+	test	ebx, 1<<5		; bit5:AVX2
+	jz	short .return
+
+	; Check that the O/S has enabled XSAVE and that the CPU supports AVX
+	mov	eax, 1
+	xor	ecx, ecx
+	cpuid
+	and	ecx, (1<<27)|(1<<28)	; bit27:OSXSAVE, bit28:AVX
+	cmp	ecx, (1<<27)|(1<<28)
+	jne	short .return
+
+	; Check that the O/S manages both XMM and YMM register state
+	xor	ecx, ecx		; XCR0
+	xgetbv				; edx:eax = XCR0
+	and	eax, 6			; bit1:SSE state, bit2:AVX state
+	cmp	eax, 6
+	jne	short .return
+
+	or	edi, JSIMD_AVX2
+
+.return:
+	mov	eax, edi		; return value (unsigned int)
+
+	pop	rdi
+	pop	rbx
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
; jsimdcpu.asm - SIMD instruction support check
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
or edi, byte JSIMD_SSE2
.no_sse2:
+ ; Check for AVX2 instruction support
+ mov eax, 7
+ xor ecx,ecx
+ cpuid
+ mov eax,ebx
+ test eax, 1<<5 ; bit5:AVX2
+ jz short .no_avx2
+ or edi, JSIMD_AVX2
+.no_avx2:
+
; Check for 3DNow! instruction support
mov eax, 0x80000000
cpuid
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, D. R. Commander.
+; Copyright (C) 2010, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
; -- segment definition --
;
%ifdef __YASM_VER__
-%define SEG_TEXT .text align=16
-%define SEG_CONST .rdata align=16
+%define SEG_TEXT .text align=32
+%define SEG_CONST .rdata align=32
%else
-%define SEG_TEXT .text align=16 public use32 class=CODE
-%define SEG_CONST .rdata align=16 public use32 class=CONST
+%define SEG_TEXT .text align=32 public use32 class=CODE
+%define SEG_CONST .rdata align=32 public use32 class=CONST
%endif
%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
; -- segment definition --
;
%ifdef __YASM_VER__
-%define SEG_TEXT .text align=16
-%define SEG_CONST .rdata align=16
+%define SEG_TEXT .text align=32
+%define SEG_CONST .rdata align=32
%else
-%define SEG_TEXT .text align=16 public use64 class=CODE
-%define SEG_CONST .rdata align=16 public use64 class=CONST
+%define SEG_TEXT .text align=32 public use64 class=CODE
+%define SEG_CONST .rdata align=32 public use64 class=CONST
%endif
%define EXTN(name) name ; foo() -> foo
; -- segment definition --
;
%ifdef __x86_64__
-%define SEG_TEXT .text progbits align=16
-%define SEG_CONST .rodata progbits align=16
+%define SEG_TEXT .text progbits align=32
+%define SEG_CONST .rodata progbits align=32
%else
-%define SEG_TEXT .text progbits alloc exec nowrite align=16
-%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
+%define SEG_TEXT .text progbits alloc exec nowrite align=32
+%define SEG_CONST .rodata progbits alloc noexec nowrite align=32
%endif
; To make the code position-independent, append -DPIC to the commandline
; -- segment definition --
;
-%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why?
-%define SEG_CONST .rodata align=16
+%define SEG_TEXT .text ;align=32 ; nasm doesn't accept an align= attribute here (first seen with align=16). why?
+%define SEG_CONST .rodata align=32
; The generation of position-independent code (PIC) is the default on Darwin.
;
%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
+%define YMMWORD ; int256 (AVX register)
+%define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD)
+%define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT
+
; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD
%define SIZEOF_DWORD 4 ; sizeof(DWORD)
%define SIZEOF_QWORD 8 ; sizeof(QWORD)
%define SIZEOF_OWORD 16 ; sizeof(OWORD)
+%define SIZEOF_YWORD 32 ; sizeof(YWORD)
%define BYTE_BIT 8 ; CHAR_BIT in C
%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
+%define YWORD_BIT 256 ; sizeof(YWORD)*BYTE_BIT
; --------------------------------------------------------------------------
; External Symbol Name
%define JSIMD_3DNOW 0x02
%define JSIMD_SSE 0x04
%define JSIMD_SSE2 0x08
+%define JSIMD_AVX2 0x80
; Short forms of external names for systems with brain-damaged linkers.
;