]> granicus.if.org Git - libjpeg-turbo/commitdiff
32-bit AVX2 implementation of integer sample conversion
author DRC <information@libjpeg-turbo.org>
Sun, 18 Feb 2018 04:15:58 +0000 (22:15 -0600)
committer DRC <information@libjpeg-turbo.org>
Mon, 19 Feb 2018 06:24:53 +0000 (00:24 -0600)
simd/i386/jquanti-avx2.asm
simd/i386/jsimd.c

index a8f24f2986ddf3afa175217ffd6b875c65b7d221..456d86ebb17feaacb53635e04a4d0fbe8f62ef1a 100644 (file)
@@ -1,5 +1,5 @@
 ;
-; jquanti.asm - sample quantization (AVX2)
+; jquanti.asm - sample data conversion and quantization (AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2016, 2018, D. R. Commander.
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
     BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; Reads eight bytes from each of the eight rows sample_data[0..7], starting
+; at index start_col, zero-extends every byte to a 16-bit word, adds 0xFF80
+; (-128 as a signed word) to each, and writes the result to workspace with
+; four 256-bit stores.
+;
+; Arguments are read from the stack relative to ebp (see the %defines
+; below).  ebp, ebx, esi, and edi are saved and restored; eax, edx, and
+; xmm0-xmm7/ymm0-ymm7 are overwritten.  vzeroupper is issued before
+; returning.
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM *workspace);
+;
+
+%define sample_data  ebp+8              ; JSAMPARRAY sample_data
+%define start_col    ebp+12             ; JDIMENSION start_col
+%define workspace    ebp+16             ; DCTELEM *workspace
+
+    align       32
+    global      EXTN(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+
+    ; Fetch the row pointers two at a time (ebx = even row, edx = odd row)
+    ; and load eight bytes from each row into the low half of xmm0..xmm7.
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=row 0
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=row 1
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=row 2
+    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=row 3
+
+    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm4=row 4
+    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm5=row 5
+
+    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm6=row 6
+    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm7=row 7
+
+    ; Pair up the rows: the even row stays in the low 128-bit lane and the
+    ; odd row is inserted into the high lane.  All four operands (dest,
+    ; src1, src2, imm8) are written out, matching the architectural
+    ; instruction form, instead of relying on an assembler-specific
+    ; shorthand that omits src1.
+    vinserti128 ymm0, ymm0, xmm1, 1     ; ymm0=(row 0 | row 1)
+    vinserti128 ymm2, ymm2, xmm3, 1     ; ymm2=(row 2 | row 3)
+    vinserti128 ymm4, ymm4, xmm5, 1     ; ymm4=(row 4 | row 5)
+    vinserti128 ymm6, ymm6, xmm7, 1     ; ymm6=(row 6 | row 7)
+
+    ; Zero-extend bytes to words: interleaving the low eight bytes of each
+    ; lane with zero bytes yields eight 16-bit words per lane.
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpunpcklbw  ymm0, ymm0, ymm1
+    vpunpcklbw  ymm2, ymm2, ymm1
+    vpunpcklbw  ymm4, ymm4, ymm1
+    vpunpcklbw  ymm6, ymm6, ymm1
+
+    ; Build the constant without a memory load: all-ones words (0xFFFF)
+    ; shifted left by 7 give 0xFF80, i.e. -128 as a signed word.
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    ; Unsigned -> signed: subtract 128 from every sample.
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm4, ymm4, ymm7
+    vpaddw      ymm6, ymm6, ymm7
+
+    ; Unaligned stores, so no alignment of workspace is required here.
+    ; NOTE(review): YMMBLOCK is defined outside this view; presumably
+    ; YMMBLOCK(r,0,...) addresses the start of row r -- confirm.
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
+
+    vzeroupper                          ; clear upper YMM halves before return
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
 
 ; --------------------------------------------------------------------------
 ;
index 85a66cb090bd0bfa7dfa761bfbfb1902fbc4a5df..b326562aa694fe515bdc81096278303d1ab16a1d 100644 (file)
@@ -763,6 +763,8 @@ jsimd_can_convsamp (void)
   if (sizeof(DCTELEM) != 2)
     return 0;
 
+  if (simd_support & JSIMD_AVX2)
+    return 1;
   if (simd_support & JSIMD_SSE2)
     return 1;
   if (simd_support & JSIMD_MMX)
@@ -800,7 +802,9 @@ GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
                 DCTELEM *workspace)
 {
-  if (simd_support & JSIMD_SSE2)
+  if (simd_support & JSIMD_AVX2)
+    jsimd_convsamp_avx2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE2)
     jsimd_convsamp_sse2(sample_data, start_col, workspace);
   else if (simd_support & JSIMD_MMX)
     jsimd_convsamp_mmx(sample_data, start_col, workspace);