;
-; jquanti.asm - sample quantization (AVX2)
+; jquanti.asm - sample data conversion and quantization (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2018, D. R. Commander.
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+; For each of the eight row pointers sample_data[0..7], loads 8 bytes
+; starting at column start_col, zero-extends every byte to a 16-bit word,
+; adds 0xFF80 to each word (i.e. subtracts 128) and stores the result to
+; workspace.  Rows are processed in pairs, one 256-bit vector per pair, so
+; four unaligned 32-byte stores are issued in total.
+;
+; In:    arguments are read from the stack frame through ebp (see below)
+; Out:   no return value; results are written through the workspace pointer
+; Saved: ebp, ebx, esi, edi (pushed on entry, popped on exit)
+; Clobb: eax, edx, ymm0-ymm7 (vzeroupper is executed before returning)
+;
+
+%define sample_data  ebp+8              ; JSAMPARRAY sample_data
+%define start_col    ebp+12             ; JDIMENSION start_col
+%define workspace    ebp+16             ; DCTELEM *workspace
+
+    align       32
+    global      EXTN(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, JSAMPARRAY [sample_data]  ; esi = row pointer table (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]    ; eax = first column to read
+    mov         edi, POINTER [workspace]       ; edi = output buffer (DCTELEM *)
+
+    ; Fetch 8 bytes from each row.  ebx/edx are reused as scratch row
+    ; pointers; row k lands in the low quadword of xmm<k>.
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    ; Pair the rows up: the even row stays in the low 128-bit lane and the
+    ; odd row is inserted into the high lane.  All four operands are written
+    ; out (dest, src1, src2, imm8) instead of relying on an assembler
+    ; shorthand for the omitted first source.
+    vinserti128 ymm0, ymm0, xmm1, 1     ; ymm0=(row0 | row1)
+    vinserti128 ymm2, ymm2, xmm3, 1     ; ymm2=(row2 | row3)
+    vinserti128 ymm4, ymm4, xmm5, 1     ; ymm4=(row4 | row5)
+    vinserti128 ymm6, ymm6, xmm7, 1     ; ymm6=(row6 | row7)
+
+    ; Interleaving with zero widens the low 8 bytes of each lane to words.
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpunpcklbw  ymm0, ymm0, ymm1
+    vpunpcklbw  ymm2, ymm2, ymm1
+    vpunpcklbw  ymm4, ymm4, ymm1
+    vpunpcklbw  ymm6, ymm6, ymm1
+
+    ; Build the bias constant without a memory load: all-ones shifted left
+    ; by 7 gives 0xFF80 in every word.
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    ; Adding 0xFF80 (mod 2^16) subtracts 128 from every word.
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm4, ymm4, ymm7
+    vpaddw      ymm6, ymm6, ymm7
+
+    ; Unaligned stores, so no alignment is demanded of workspace here.
+    ; YMMBLOCK is defined outside this view; presumably it addresses rows
+    ; 0, 2, 4 and 6 of the block -- confirm against its definition.
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
+
+    vzeroupper                          ; clear upper ymm halves before returning
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
; --------------------------------------------------------------------------
;