; jquanti.asm - sample quantization (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2018, D. R. Commander.
; Copyright (C) 2016, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library
; DCTELEM *workspace);
;
; RECIPROCAL / CORRECTION / SCALE address helpers.
; Each expands to a block address with base (b), column (n), element size
; SIZEOF_DCTELEM, and row index DCTSIZE*k+(m), where k = 0 for RECIPROCAL,
; 1 for CORRECTION, and 2 for SCALE.  The three regions therefore start
; DCTSIZE rows apart from the same base.
; NOTE(review): (b) is presumably the divisors pointer -- confirm at the use
; sites, which are not visible in this excerpt.
; This change only swaps XMMBLOCK for YMMBLOCK.  The two macros differ solely
; in the unit that (n) is scaled by (SIZEOF_XMMWORD vs. SIZEOF_YMMWORD), so
; the resulting address is unchanged whenever (n) is 0.
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+%define RECIPROCAL(m,n,b) YMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) YMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b) YMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
%define coef_block ebp+8 ; JCOEFPTR coef_block
%define divisors ebp+12 ; DCTELEM *divisors
mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block]
- vmovdqu ymm4, [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
- vmovdqu ymm5, [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
- vmovdqu ymm6, [XMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
- vmovdqu ymm7, [XMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
vpabsw ymm0, ymm4
vpabsw ymm1, ymm5
vpabsw ymm2, ymm6
vpsignw ymm2, ymm2, ymm6
vpsignw ymm3, ymm3, ymm7
- vmovdqu [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
- vmovdqu [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
- vmovdqu [XMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
- vmovdqu [XMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+ vmovdqu [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+ vmovdqu [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+ vmovdqu [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+ vmovdqu [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
vzeroupper
pop edi
; jdct.inc - private declarations for forward & reverse DCT subsystems
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2018, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Block addressing helpers.  Each one evaluates to
;     (b) + (m)*DCTSIZE*(s) + (n)*<unit size>
; that is: base address (b), plus (m) rows of DCTSIZE elements of size (s),
; plus (n) units of the width named by the macro (DWORD, MMWORD, XMMWORD or
; YMMWORD).  All four share the same row term and differ only in the unit
; that the column index (n) is scaled by.
%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
+%define YMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_YMMWORD)
; --------------------------------------------------------------------------
; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
; Copyright (C) 2016, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library
; DCTELEM *workspace);
;
; RECIPROCAL / CORRECTION / SCALE address helpers (64-bit build).
; Each expands to a block address with base (b), column (n), element size
; SIZEOF_DCTELEM, and row index DCTSIZE*k+(m), where k = 0 for RECIPROCAL,
; 1 for CORRECTION, and 2 for SCALE.  The three regions therefore start
; DCTSIZE rows apart from the same base.
; NOTE(review): (b) is presumably the divisors pointer (r11 per the register
; notes in this file) -- confirm at the use sites, which are not visible in
; this excerpt.
; This change only swaps XMMBLOCK for YMMBLOCK.  The two macros differ solely
; in the unit that (n) is scaled by (SIZEOF_XMMWORD vs. SIZEOF_YMMWORD), so
; the resulting address is unchanged whenever (n) is 0.
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+%define RECIPROCAL(m,n,b) YMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) YMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b) YMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
; r10 = JCOEFPTR coef_block
; r11 = DCTELEM *divisors
mov rbp, rsp
collect_args 3
- vmovdqu ymm4, [XMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
- vmovdqu ymm5, [XMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
- vmovdqu ymm6, [XMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
- vmovdqu ymm7, [XMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
vpabsw ymm0, ymm4
vpabsw ymm1, ymm5
vpabsw ymm2, ymm6
vpsignw ymm2, ymm2, ymm6
vpsignw ymm3, ymm3, ymm7
- vmovdqu [XMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
- vmovdqu [XMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
- vmovdqu [XMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
- vmovdqu [XMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
+ vmovdqu [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
+ vmovdqu [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
+ vmovdqu [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
+ vmovdqu [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
vzeroupper
uncollect_args 3