; jccolext.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define WK_NUM 8
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
+ align 32
global EXTN(jsimd_rgb_ycc_convert_mmx)
EXTN(jsimd_rgb_ycc_convert_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
;
; jccolext.asm - colorspace conversion (SSE2)
;
-; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
%define WK_NUM 8
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
+ align 32
global EXTN(jsimd_rgb_ycc_convert_sse2)
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jccolor.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_rgb_ycc_convert_mmx)
EXTN(jconst_rgb_ycc_convert_mmx):
PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
;
; jccolor.asm - colorspace conversion (SSE2)
;
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2):
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
; jcgray.asm - grayscale colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, D. R. Commander.
+; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_rgb_gray_convert_mmx)
EXTN(jconst_rgb_gray_convert_mmx):
PW_F0114_F0250 times 2 dw F_0_114, F_0_250
PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
;
; jcgray.asm - grayscale colorspace conversion (SSE2)
;
-; Copyright (C) 2011, D. R. Commander.
+; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2):
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
; jcgryext.asm - grayscale colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, D. R. Commander.
+; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
+ align 32
global EXTN(jsimd_rgb_gray_convert_mmx)
EXTN(jsimd_rgb_gray_convert_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
;
; jcgryext.asm - grayscale colorspace conversion (SSE2)
;
-; Copyright (C) 2011, D. R. Commander.
+; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
+ align 32
global EXTN(jsimd_rgb_gray_convert_sse2)
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block):
%include "jpeg_nbits_table.inc"
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define put_buffer ebx
%define put_bits edi
- align 16
+ align 32
global EXTN(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jcsample.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define input_data(b) (b)+24 ; JSAMPARRAY input_data
%define output_data(b) (b)+28 ; JSAMPARRAY output_data
- align 16
+ align 32
global EXTN(jsimd_h2v1_downsample_mmx)
EXTN(jsimd_h2v1_downsample_mmx):
%define input_data(b) (b)+24 ; JSAMPARRAY input_data
%define output_data(b) (b)+28 ; JSAMPARRAY output_data
- align 16
+ align 32
global EXTN(jsimd_h2v2_downsample_mmx)
EXTN(jsimd_h2v2_downsample_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jcsample.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define input_data(b) (b)+24 ; JSAMPARRAY input_data
%define output_data(b) (b)+28 ; JSAMPARRAY output_data
- align 16
+ align 32
global EXTN(jsimd_h2v1_downsample_sse2)
EXTN(jsimd_h2v1_downsample_sse2):
%define input_data(b) (b)+24 ; JSAMPARRAY input_data
%define output_data(b) (b)+28 ; JSAMPARRAY output_data
- align 16
+ align 32
global EXTN(jsimd_h2v2_downsample_sse2)
EXTN(jsimd_h2v2_downsample_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jdcolext.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
+ align 32
global EXTN(jsimd_ycc_rgb_convert_mmx)
EXTN(jsimd_ycc_rgb_convert_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jdcolext.asm - colorspace conversion (SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, D. R. Commander.
+; Copyright (C) 2012, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define WK_NUM 2
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
+ align 32
global EXTN(jsimd_ycc_rgb_convert_sse2)
EXTN(jsimd_ycc_rgb_convert_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jdcolor.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_ycc_rgb_convert_mmx)
EXTN(jconst_ycc_rgb_convert_mmx):
PW_ONE times 4 dw 1
PD_ONEHALF times 2 dd 1 << (SCALEBITS-1)
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
; jdcolor.asm - colorspace conversion (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2):
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
; jdmerge.asm - merged upsampling/color conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_merged_upsample_mmx)
EXTN(jconst_merged_upsample_mmx):
PW_ONE times 4 dw 1
PD_ONEHALF times 2 dd 1 << (SCALEBITS-1)
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
; jdmerge.asm - merged upsampling/color conversion (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2):
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
; jdmrgext.asm - merged upsampling/color conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define WK_NUM 3
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
+ align 32
global EXTN(jsimd_h2v1_merged_upsample_mmx)
EXTN(jsimd_h2v1_merged_upsample_mmx):
%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
- align 16
+ align 32
global EXTN(jsimd_h2v2_merged_upsample_mmx)
EXTN(jsimd_h2v2_merged_upsample_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jdmrgext.asm - merged upsampling/color conversion (SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, D. R. Commander.
+; Copyright (C) 2012, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define WK_NUM 3
%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
+ align 32
global EXTN(jsimd_h2v1_merged_upsample_sse2)
EXTN(jsimd_h2v1_merged_upsample_sse2):
%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
- align 16
+ align 32
global EXTN(jsimd_h2v2_merged_upsample_sse2)
EXTN(jsimd_h2v2_merged_upsample_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jdsample.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_fancy_upsample_mmx)
EXTN(jconst_fancy_upsample_mmx):
PW_SEVEN times 4 dw 7
PW_EIGHT times 4 dw 8
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define input_data(b) (b)+16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
- align 16
+ align 32
global EXTN(jsimd_h2v1_fancy_upsample_mmx)
EXTN(jsimd_h2v1_fancy_upsample_mmx):
%define WK_NUM 4
%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr
- align 16
+ align 32
global EXTN(jsimd_h2v2_fancy_upsample_mmx)
EXTN(jsimd_h2v2_fancy_upsample_mmx):
%define input_data(b) (b)+16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
- align 16
+ align 32
global EXTN(jsimd_h2v1_upsample_mmx)
EXTN(jsimd_h2v1_upsample_mmx):
%define input_data(b) (b)+16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
- align 16
+ align 32
global EXTN(jsimd_h2v2_upsample_mmx)
EXTN(jsimd_h2v2_upsample_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jdsample.asm - upsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_fancy_upsample_sse2)
EXTN(jconst_fancy_upsample_sse2):
PW_SEVEN times 8 dw 7
PW_EIGHT times 8 dw 8
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define input_data(b) (b)+16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
- align 16
+ align 32
global EXTN(jsimd_h2v1_fancy_upsample_sse2)
EXTN(jsimd_h2v1_fancy_upsample_sse2):
%define WK_NUM 4
%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr
- align 16
+ align 32
global EXTN(jsimd_h2v2_fancy_upsample_sse2)
EXTN(jsimd_h2v2_fancy_upsample_sse2):
%define input_data(b) (b)+16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
- align 16
+ align 32
global EXTN(jsimd_h2v1_upsample_sse2)
EXTN(jsimd_h2v1_upsample_sse2):
%define input_data(b) (b)+16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
- align 16
+ align 32
global EXTN(jsimd_h2v2_upsample_sse2)
EXTN(jsimd_h2v2_upsample_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jfdctflt.asm - floating-point FDCT (3DNow!)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_fdct_float_3dnow)
EXTN(jconst_fdct_float_3dnow):
PD_0_541 times 2 dd 0.541196100146196984399723
PD_1_306 times 2 dd 1.306562964876376527856643
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_fdct_float_3dnow)
EXTN(jsimd_fdct_float_3dnow):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jfdctflt.asm - floating-point FDCT (SSE)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse):
PD_0_541 times 4 dd 0.541196100146196984399723
PD_1_306 times 4 dd 1.306562964876376527856643
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_fdct_float_sse)
EXTN(jsimd_fdct_float_sse):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jfdctfst.asm - fast integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
- alignz 16
+ alignz 32
global EXTN(jconst_fdct_ifast_mmx)
EXTN(jconst_fdct_ifast_mmx):
PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_fdct_ifast_mmx)
EXTN(jsimd_fdct_ifast_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jfdctfst.asm - fast integer FDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
- alignz 16
+ alignz 32
global EXTN(jconst_fdct_ifast_sse2)
EXTN(jconst_fdct_ifast_sse2):
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_fdct_ifast_sse2)
EXTN(jsimd_fdct_ifast_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jfdctint.asm - accurate integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_fdct_islow_mmx)
EXTN(jconst_fdct_islow_mmx):
PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1)
PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1)
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_fdct_islow_mmx)
EXTN(jsimd_fdct_islow_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jfdctint.asm - accurate integer FDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_fdct_islow_sse2)
EXTN(jconst_fdct_islow_sse2):
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 6
- align 16
+ align 32
global EXTN(jsimd_fdct_islow_sse2)
EXTN(jsimd_fdct_islow_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_idct_float_3dnow)
EXTN(jconst_idct_float_3dnow):
PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 8 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
; FAST_FLOAT workspace[DCTSIZE2]
- align 16
+ align 32
global EXTN(jsimd_idct_float_3dnow)
EXTN(jsimd_idct_float_3dnow):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctflt.asm - floating-point IDCT (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_idct_float_sse)
EXTN(jconst_idct_float_sse):
PD_0_125 times 4 dd 0.125 ; 1/8
PB_CENTERJSAMP times 8 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
; FAST_FLOAT workspace[DCTSIZE2]
- align 16
+ align 32
global EXTN(jsimd_idct_float_sse)
EXTN(jsimd_idct_float_sse):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_idct_float_sse2)
EXTN(jconst_idct_float_sse2):
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
; FAST_FLOAT workspace[DCTSIZE2]
- align 16
+ align 32
global EXTN(jsimd_idct_float_sse2)
EXTN(jsimd_idct_float_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctfst.asm - fast integer IDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
- alignz 16
+ alignz 32
global EXTN(jconst_idct_ifast_mmx)
EXTN(jconst_idct_ifast_mmx):
PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 8 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
; JCOEF workspace[DCTSIZE2]
- align 16
+ align 32
global EXTN(jsimd_idct_ifast_mmx)
EXTN(jsimd_idct_ifast_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctfst.asm - fast integer IDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
- alignz 16
+ alignz 32
global EXTN(jconst_idct_ifast_sse2)
EXTN(jconst_idct_ifast_sse2):
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_idct_ifast_sse2)
EXTN(jsimd_idct_ifast_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctint.asm - accurate integer IDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_idct_islow_mmx)
EXTN(jconst_idct_islow_mmx):
PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1)
PB_CENTERJSAMP times 8 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
; JCOEF workspace[DCTSIZE2]
- align 16
+ align 32
global EXTN(jsimd_idct_islow_mmx)
EXTN(jsimd_idct_islow_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctint.asm - accurate integer IDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_idct_islow_sse2)
EXTN(jconst_idct_islow_sse2):
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 12
- align 16
+ align 32
global EXTN(jsimd_idct_islow_sse2)
EXTN(jsimd_idct_islow_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctred.asm - reduced-size IDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_idct_red_mmx)
EXTN(jconst_idct_red_mmx):
PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1)
PB_CENTERJSAMP times 8 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
; JCOEF workspace[DCTSIZE2]
- align 16
+ align 32
global EXTN(jsimd_idct_4x4_mmx)
EXTN(jsimd_idct_4x4_mmx):
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col
- align 16
+ align 32
global EXTN(jsimd_idct_2x2_mmx)
EXTN(jsimd_idct_2x2_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jidctred.asm - reduced-size IDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 16
+ alignz 32
global EXTN(jconst_idct_red_sse2)
EXTN(jconst_idct_red_sse2):
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
- align 16
+ align 32
global EXTN(jsimd_idct_4x4_sse2)
EXTN(jsimd_idct_4x4_sse2):
%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
%define output_col(b) (b)+20 ; JDIMENSION output_col
- align 16
+ align 32
global EXTN(jsimd_idct_2x2_sse2)
EXTN(jsimd_idct_2x2_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; FAST_FLOAT *workspace
- align 16
+ align 32
global EXTN(jsimd_convsamp_float_3dnow)
EXTN(jsimd_convsamp_float_3dnow):
%define divisors ebp+12 ; FAST_FLOAT *divisors
%define workspace ebp+16 ; FAST_FLOAT *workspace
- align 16
+ align 32
global EXTN(jsimd_quantize_float_3dnow)
EXTN(jsimd_quantize_float_3dnow):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jquant.asm - sample data conversion and quantization (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; DCTELEM *workspace
- align 16
+ align 32
global EXTN(jsimd_convsamp_mmx)
EXTN(jsimd_convsamp_mmx):
%define divisors ebp+12 ; DCTELEM *divisors
%define workspace ebp+16 ; DCTELEM *workspace
- align 16
+ align 32
global EXTN(jsimd_quantize_mmx)
EXTN(jsimd_quantize_mmx):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jquant.asm - sample data conversion and quantization (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; FAST_FLOAT *workspace
- align 16
+ align 32
global EXTN(jsimd_convsamp_float_sse)
EXTN(jsimd_convsamp_float_sse):
%define divisors ebp+12 ; FAST_FLOAT *divisors
%define workspace ebp+16 ; FAST_FLOAT *workspace
- align 16
+ align 32
global EXTN(jsimd_quantize_float_sse)
EXTN(jsimd_quantize_float_sse):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; FAST_FLOAT *workspace
- align 16
+ align 32
global EXTN(jsimd_convsamp_float_sse2)
EXTN(jsimd_convsamp_float_sse2):
%define divisors ebp+12 ; FAST_FLOAT *divisors
%define workspace ebp+16 ; FAST_FLOAT *workspace
- align 16
+ align 32
global EXTN(jsimd_quantize_float_sse2)
EXTN(jsimd_quantize_float_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; jquanti.asm - sample data conversion and quantization (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
%define start_col ebp+12 ; JDIMENSION start_col
%define workspace ebp+16 ; DCTELEM *workspace
- align 16
+ align 32
global EXTN(jsimd_convsamp_sse2)
EXTN(jsimd_convsamp_sse2):
%define divisors ebp+12 ; DCTELEM *divisors
%define workspace ebp+16 ; DCTELEM *workspace
- align 16
+ align 32
global EXTN(jsimd_quantize_sse2)
EXTN(jsimd_quantize_sse2):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
env = getenv("JSIMD_FORCESSE2");
if ((env != NULL) && (strcmp(env, "1") == 0))
simd_support &= JSIMD_SSE2;
+ env = getenv("JSIMD_FORCEAVX2");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support &= JSIMD_AVX2;
env = getenv("JSIMD_FORCENONE");
if ((env != NULL) && (strcmp(env, "1") == 0))
simd_support = 0;
; jpeg_simd_cpu_support (void)
;
- align 16
+ align 32
global EXTN(jpeg_simd_cpu_support)
EXTN(jpeg_simd_cpu_support):
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 32
; -- segment definition --
;
-%define SEG_TEXT _text align=16 public use32 class=CODE
-%define SEG_CONST _data align=16 public use32 class=DATA
+%define SEG_TEXT _text align=32 public use32 class=CODE
+%define SEG_CONST _data align=32 public use32 class=DATA
%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; -- segment definition --
;
-%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=16. why?
+%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why?
%define SEG_CONST .rodata align=32
; The generation of position-independent code (PIC) is the default on Darwin.
compptr=&cinfo->comp_info[i];
_tmpbuf[i]=(JSAMPLE *)malloc(
PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE)
- /compptr->h_samp_factor, 16) * cinfo->max_v_samp_factor + 16);
+ /compptr->h_samp_factor, 32) * cinfo->max_v_samp_factor + 32);
if(!_tmpbuf[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*cinfo->max_v_samp_factor);
if(!tmpbuf[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
for(row=0; row<cinfo->max_v_samp_factor; row++)
{
unsigned char *_tmpbuf_aligned=
- (unsigned char *)PAD((size_t)_tmpbuf[i], 16);
+ (unsigned char *)PAD((size_t)_tmpbuf[i], 32);
tmpbuf[i][row]=&_tmpbuf_aligned[
PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE)
- /compptr->h_samp_factor, 16) * row];
+ /compptr->h_samp_factor, 32) * row];
}
- _tmpbuf2[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16)
- * compptr->v_samp_factor + 16);
+ _tmpbuf2[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 32)
+ * compptr->v_samp_factor + 32);
if(!_tmpbuf2[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
tmpbuf2[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
if(!tmpbuf2[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
for(row=0; row<compptr->v_samp_factor; row++)
{
unsigned char *_tmpbuf2_aligned=
- (unsigned char *)PAD((size_t)_tmpbuf2[i], 16);
+ (unsigned char *)PAD((size_t)_tmpbuf2[i], 32);
tmpbuf2[i][row]=&_tmpbuf2_aligned[
- PAD(compptr->width_in_blocks*DCTSIZE, 16) * row];
+ PAD(compptr->width_in_blocks*DCTSIZE, 32) * row];
}
pw[i]=pw0*compptr->h_samp_factor/cinfo->max_h_samp_factor;
ph[i]=ph0*compptr->v_samp_factor/cinfo->max_v_samp_factor;
for(i=0; i<dinfo->num_components; i++)
{
compptr=&dinfo->comp_info[i];
- _tmpbuf[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16)
- * compptr->v_samp_factor + 16);
+ _tmpbuf[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 32)
+ * compptr->v_samp_factor + 32);
if(!_tmpbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
if(!tmpbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
for(row=0; row<compptr->v_samp_factor; row++)
{
unsigned char *_tmpbuf_aligned=
- (unsigned char *)PAD((size_t)_tmpbuf[i], 16);
+ (unsigned char *)PAD((size_t)_tmpbuf[i], 32);
tmpbuf[i][row]=&_tmpbuf_aligned[
- PAD(compptr->width_in_blocks*DCTSIZE, 16) * row];
+ PAD(compptr->width_in_blocks*DCTSIZE, 32) * row];
}
pw[i]=pw0*compptr->h_samp_factor/dinfo->max_h_samp_factor;
ph[i]=ph0*compptr->v_samp_factor/dinfo->max_v_samp_factor;