From: DRC Date: Sun, 18 Feb 2018 01:39:53 +0000 (-0600) Subject: 64-bit AVX2 implementation of int sample conv. X-Git-Tag: 1.5.90~39 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=39e9e65c5b41efd116ed27388ae957517d0c531b;p=libjpeg-turbo 64-bit AVX2 implementation of int sample conv. --- diff --git a/simd/jsimd.h b/simd/jsimd.h index e3fde7e..455c249 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -816,6 +816,9 @@ EXTERN(void) jsimd_convsamp_mmx EXTERN(void) jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); +EXTERN(void) jsimd_convsamp_avx2 + (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); + EXTERN(void) jsimd_convsamp_neon (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); diff --git a/simd/x86_64/jquanti-avx2.asm b/simd/x86_64/jquanti-avx2.asm index 5eadeaa..ec96da5 100644 --- a/simd/x86_64/jquanti-avx2.asm +++ b/simd/x86_64/jquanti-avx2.asm @@ -23,6 +23,71 @@ ; -------------------------------------------------------------------------- SECTION SEG_TEXT BITS 64 +; +; Load data into workspace, applying unsigned->signed conversion +; +; GLOBAL(void) +; jsimd_convsamp_avx2 (JSAMPARRAY sample_data, JDIMENSION start_col, +; DCTELEM *workspace); +; + +; r10 = JSAMPARRAY sample_data +; r11d = JDIMENSION start_col +; r12 = DCTELEM *workspace + + align 32 + global EXTN(jsimd_convsamp_avx2) + +EXTN(jsimd_convsamp_avx2): + push rbp + mov rax, rsp + mov rbp, rsp + collect_args 3 + + mov eax, r11d + + mov rsi, JSAMPROW [r10+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdi, JSAMPROW [r10+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + mov rsi, JSAMPROW [r10+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdi, JSAMPROW [r10+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + mov rsi, JSAMPROW [r10+4*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdi, JSAMPROW [r10+5*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + mov rsi, JSAMPROW [r10+6*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdi, JSAMPROW [r10+7*SIZEOF_JSAMPROW] ; (JSAMPLE *) + movq xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE] + pinsrq xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1 + + vpmovzxbw ymm0, xmm0 ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + vpmovzxbw ymm1, xmm1 ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + vpmovzxbw ymm2, xmm2 ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + vpmovzxbw ymm3, xmm3 ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + vpcmpeqw ymm7, ymm7, ymm7 + vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + vpaddw ymm0, ymm0, ymm7 + vpaddw ymm1, ymm1, ymm7 + vpaddw ymm2, ymm2, ymm7 + vpaddw ymm3, ymm3, ymm7 + + vmovdqu YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0 + vmovdqu YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1 + vmovdqu YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2 + vmovdqu YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3 + + vzeroupper + uncollect_args 3 + pop rbp + ret ; -------------------------------------------------------------------------- ; diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c index e6d6e1c..53a9db9 100644 --- a/simd/x86_64/jsimd.c +++ b/simd/x86_64/jsimd.c @@ -660,6 +660,8 @@ jsimd_can_convsamp (void) if (sizeof(DCTELEM) != 2) return 0; + if (simd_support & JSIMD_AVX2) + return 1; if (simd_support & JSIMD_SSE2) return 1; @@ -691,7 +693,10 @@ GLOBAL(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace) { - jsimd_convsamp_sse2(sample_data, start_col, workspace); + if (simd_support & JSIMD_AVX2) + jsimd_convsamp_avx2(sample_data, start_col, workspace); + else + jsimd_convsamp_sse2(sample_data, start_col, workspace); } GLOBAL(void)