From: DRC
Date: Sun, 18 Feb 2018 04:15:58 +0000 (-0600)
Subject: 32-bit AVX2 implementation of int sample conv.
X-Git-Tag: 1.5.90~37
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=715b7c38a8d58f0d8f2026311901a8fc2b371214;p=libjpeg-turbo

32-bit AVX2 implementation of int sample conv.
---

diff --git a/simd/i386/jquanti-avx2.asm b/simd/i386/jquanti-avx2.asm
index a8f24f2..456d86e 100644
--- a/simd/i386/jquanti-avx2.asm
+++ b/simd/i386/jquanti-avx2.asm
@@ -1,5 +1,5 @@
 ;
-; jquanti.asm - sample quantization (AVX2)
+; jquanti.asm - sample data conversion and quantization (AVX2)
 ;
 ; Copyright 2009 Pierre Ossman for Cendio AB
 ; Copyright (C) 2016, 2018, D. R. Commander.
@@ -23,6 +23,100 @@
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
     BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM *workspace);
+;
+; Loads 64 bits of sample data from each of the eight rows
+; sample_data[0..7], starting at column start_col, zero-extends every byte
+; to a 16-bit word, adds 0xFF80 to each word (equivalent to subtracting 128
+; modulo 2^16) and stores the 64 resulting words to workspace.
+;
+; Arguments are read from the stack relative to ebp.  ebx, esi and edi are
+; saved and restored; eax, edx and xmm0-xmm7 are overwritten.  The upper
+; halves of all ymm registers are cleared (vzeroupper) before returning.
+;
+
+%define sample_data     ebp+8           ; JSAMPARRAY sample_data
+%define start_col       ebp+12          ; JDIMENSION start_col
+%define workspace       ebp+16          ; DCTELEM *workspace
+
+    align       32
+    global      EXTN(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]        ; (DCTELEM *)
+
+    ; Fetch the row pointers two at a time and load 64 bits from each row.
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+    vmovq       xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    vmovq       xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+    vmovq       xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    vmovq       xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+    vmovq       xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    vmovq       xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+    vmovq       xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    vmovq       xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    ; Pair the rows: even row in the low 128-bit lane, odd row in the high.
+    vinserti128 ymm0, ymm0, xmm1, 1     ; ymm0=(row 0 | row 1)
+    vinserti128 ymm2, ymm2, xmm3, 1     ; ymm2=(row 2 | row 3)
+    vinserti128 ymm4, ymm4, xmm5, 1     ; ymm4=(row 4 | row 5)
+    vinserti128 ymm6, ymm6, xmm7, 1     ; ymm6=(row 6 | row 7)
+
+    ; Interleave every sample byte with a zero byte (byte -> word widening).
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpunpcklbw  ymm0, ymm0, ymm1
+    vpunpcklbw  ymm2, ymm2, ymm1
+    vpunpcklbw  ymm4, ymm4, ymm1
+    vpunpcklbw  ymm6, ymm6, ymm1
+
+    vpcmpeqw    ymm7, ymm7, ymm7        ; ymm7=(all 1's)
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    ; Adding 0xFF80 to a 16-bit word subtracts 128 (unsigned -> signed).
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm4, ymm4, ymm7
+    vpaddw      ymm6, ymm6, ymm7
+
+    ; Store the four vectors (two rows each); unaligned stores are used.
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
+
+    vzeroupper                          ; clear upper ymm halves before return
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
 
 ; --------------------------------------------------------------------------
 ;
diff --git a/simd/i386/jsimd.c b/simd/i386/jsimd.c
index 85a66cb..b326562 100644
--- a/simd/i386/jsimd.c
+++ b/simd/i386/jsimd.c
@@ -763,6 +763,8 @@ jsimd_can_convsamp (void)
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if (simd_support & JSIMD_AVX2)
+    return 1;
   if (simd_support & JSIMD_SSE2)
     return 1;
   if (simd_support & JSIMD_MMX)
@@ -800,7 +802,9 @@ GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
                 DCTELEM *workspace)
 {
-  if (simd_support & JSIMD_SSE2)
+  if (simd_support & JSIMD_AVX2)
+    jsimd_convsamp_avx2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE2)
     jsimd_convsamp_sse2(sample_data, start_col, workspace);
   else if (simd_support & JSIMD_MMX)
     jsimd_convsamp_mmx(sample_data, start_col, workspace);