From ff392d81ef3f96ee668511935d4d90b24cda2808 Mon Sep 17 00:00:00 2001 From: DRC Date: Sat, 17 Feb 2018 17:29:38 -0600 Subject: [PATCH] AVX2: Introduce YMMBLOCK macro for readability --- simd/i386/jquanti-avx2.asm | 24 ++++++++++++------------ simd/nasm/jdct.inc | 2 ++ simd/x86_64/jquanti-avx2.asm | 24 ++++++++++++------------ 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/simd/i386/jquanti-avx2.asm b/simd/i386/jquanti-avx2.asm index 0356185..a8f24f2 100644 --- a/simd/i386/jquanti-avx2.asm +++ b/simd/i386/jquanti-avx2.asm @@ -2,7 +2,7 @@ ; jquanti.asm - sample quantization (AVX2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2018, D. R. Commander. ; Copyright (C) 2016, Matthieu Darbois. ; ; Based on the x86 SIMD extension for IJG JPEG library @@ -37,9 +37,9 @@ ; DCTELEM *workspace); ; -%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) -%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) -%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) +%define RECIPROCAL(m,n,b) YMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) +%define CORRECTION(m,n,b) YMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) +%define SCALE(m,n,b) YMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) %define coef_block ebp+8 ; JCOEFPTR coef_block %define divisors ebp+12 ; DCTELEM *divisors @@ -61,10 +61,10 @@ EXTN(jsimd_quantize_avx2): mov edx, POINTER [divisors] mov edi, JCOEFPTR [coef_block] - vmovdqu ymm4, [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] - vmovdqu ymm5, [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] - vmovdqu ymm6, [XMMBLOCK(4,0,esi,SIZEOF_DCTELEM)] - vmovdqu ymm7, [XMMBLOCK(6,0,esi,SIZEOF_DCTELEM)] + vmovdqu ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + vmovdqu ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] + vmovdqu ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)] + vmovdqu ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)] vpabsw ymm0, ymm4 vpabsw ymm1, ymm5 vpabsw ymm2, ymm6 @@ -88,10 +88,10 @@ EXTN(jsimd_quantize_avx2): vpsignw ymm2, ymm2, ymm6 vpsignw ymm3, ymm3, ymm7 - vmovdqu [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0 - vmovdqu [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1 - vmovdqu [XMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2 - vmovdqu [XMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3 + vmovdqu [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0 + vmovdqu [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1 + vmovdqu [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2 + vmovdqu [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3 vzeroupper pop edi diff --git a/simd/nasm/jdct.inc b/simd/nasm/jdct.inc index 7ae2ca4..20c8f3d 100644 --- a/simd/nasm/jdct.inc +++ b/simd/nasm/jdct.inc @@ -2,6 +2,7 @@ ; jdct.inc - private declarations for forward & reverse DCT subsystems ; ; Copyright 2009 Pierre Ossman for Cendio AB +; Copyright (C) 2018, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -23,5 +24,6 @@ %define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD) %define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD) %define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD) +%define YMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_YMMWORD) ; -------------------------------------------------------------------------- diff --git a/simd/x86_64/jquanti-avx2.asm b/simd/x86_64/jquanti-avx2.asm index 60ce19f..5eadeaa 100644 --- a/simd/x86_64/jquanti-avx2.asm +++ b/simd/x86_64/jquanti-avx2.asm @@ -2,7 +2,7 @@ ; jquanti.asm - sample data conversion and quantization (64-bit AVX2) ; ; Copyright 2009 Pierre Ossman for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2018, D. R. Commander. ; Copyright (C) 2016, Matthieu Darbois. ; ; Based on the x86 SIMD extension for IJG JPEG library @@ -37,9 +37,9 @@ ; DCTELEM *workspace); ; -%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) -%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) -%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) +%define RECIPROCAL(m,n,b) YMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) +%define CORRECTION(m,n,b) YMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) +%define SCALE(m,n,b) YMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) ; r10 = JCOEFPTR coef_block ; r11 = DCTELEM *divisors @@ -54,10 +54,10 @@ EXTN(jsimd_quantize_avx2): mov rbp, rsp collect_args 3 - vmovdqu ymm4, [XMMBLOCK(0,0,r12,SIZEOF_DCTELEM)] - vmovdqu ymm5, [XMMBLOCK(2,0,r12,SIZEOF_DCTELEM)] - vmovdqu ymm6, [XMMBLOCK(4,0,r12,SIZEOF_DCTELEM)] - vmovdqu ymm7, [XMMBLOCK(6,0,r12,SIZEOF_DCTELEM)] + vmovdqu ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)] + vmovdqu ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)] + vmovdqu ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)] + vmovdqu ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)] vpabsw ymm0, ymm4 vpabsw ymm1, ymm5 vpabsw ymm2, ymm6 @@ -81,10 +81,10 @@ EXTN(jsimd_quantize_avx2): vpsignw ymm2, ymm2, ymm6 vpsignw ymm3, ymm3, ymm7 - vmovdqu [XMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0 - vmovdqu [XMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1 - vmovdqu [XMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2 - vmovdqu [XMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3 + vmovdqu [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0 + vmovdqu [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1 + vmovdqu [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2 + vmovdqu [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3 vzeroupper uncollect_args 3 -- 2.40.0