]> granicus.if.org Git - libjpeg-turbo/commitdiff
64-bit AVX2 impl. of YCC->RGB color conversion
author DRC <information@libjpeg-turbo.org>
Sun, 29 May 2016 11:54:56 +0000 (06:54 -0500)
committer DRC <information@libjpeg-turbo.org>
Wed, 1 Jun 2016 03:56:11 +0000 (22:56 -0500)
simd/CMakeLists.txt
simd/Makefile.am
simd/jdcolext-avx2-64.asm [new file with mode: 0644]
simd/jdcolor-avx2-64.asm [new file with mode: 0644]
simd/jsimd.h
simd/jsimd_x86_64.c

index 97caf51d2bbe8485c1c9f823cf1e34b045920a31..492cbfe8431f6e04f3d3f204d46270e8d6dadb3b 100755 (executable)
@@ -25,7 +25,8 @@ if(SIMD_X86_64)
     jchuff-sse2-64 jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64
     jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
     jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
-    jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64 jcsample-avx2-64)
+    jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64 jcsample-avx2-64
+    jdcolor-avx2-64)
   message(STATUS "Building x86_64 SIMD extensions")
 else()
   set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx
index edf29ff4d175a67c7d3c88781a2fcf0d9dc40b79..214f70123f104ef2bf9b121cfd4595e702f021b4 100644 (file)
@@ -19,7 +19,8 @@ libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
        jdsample-sse2-64.asm  jfdctfst-sse2-64.asm  jfdctint-sse2-64.asm \
        jidctflt-sse2-64.asm  jidctfst-sse2-64.asm  jidctint-sse2-64.asm \
        jidctred-sse2-64.asm  jquantf-sse2-64.asm   jquanti-sse2-64.asm \
-       jccolor-avx2-64.asm   jcgray-avx2-64.asm    jcsample-avx2-64.asm
+       jccolor-avx2-64.asm   jcgray-avx2-64.asm    jcsample-avx2-64.asm \
+       jdcolor-avx2-64.asm
 
 jccolor-sse2-64.lo:  jccolext-sse2-64.asm
 jcgray-sse2-64.lo:   jcgryext-sse2-64.asm
@@ -27,6 +28,7 @@ jdcolor-sse2-64.lo:  jdcolext-sse2-64.asm
 jdmerge-sse2-64.lo:  jdmrgext-sse2-64.asm
 jccolor-avx2-64.lo:  jccolext-avx2-64.asm
 jcgray-avx2-64.lo:   jcgryext-avx2-64.asm
+jdcolor-avx2-64.lo:  jdcolext-avx2-64.asm
 
 endif
 
diff --git a/simd/jdcolext-avx2-64.asm b/simd/jdcolext-avx2-64.asm
new file mode 100644 (file)
index 0000000..28278bf
--- /dev/null
@@ -0,0 +1,429 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    global      EXTN(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = rsp after the push (-> saved rbp)
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax                   ; keep the unaligned rsp for the epilogue
+    mov         rbp, rsp                     ; rbp = aligned frame base
+    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] just below rbp
+    collect_args 5                           ; args are then in r10..r14 (see header)
+    push        rbx
+
+    mov         ecx, r10d               ; num_cols
+    test        rcx, rcx
+    jz          near .return            ; nothing to do for a zero-width image
+
+    push        rcx                     ; rcx is reused below as the row index
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; &input_buf[0][input_row]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; &input_buf[1][input_row]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; &input_buf[2][input_row]
+
+    pop         rcx                     ; rcx = num_cols again
+
+    mov         rdi, r13
+    mov         eax, r14d
+    test        eax, eax                ; 32-bit test: num_rows is a signed int
+    jle         near .return            ; zero or negative row count -> no work
+.rowloop:
+    push        rax
+    push        rdi
+    push        rdx
+    push        rbx
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr0
+    mov         rbx, JSAMPROW [rbx]     ; inptr1
+    mov         rdx, JSAMPROW [rdx]     ; inptr2
+    mov         rdi, JSAMPROW [rdi]     ; outptr
+.columnloop:
+    ; Each pass loads one full YMMWORD from every input row and converts it.
+    vmovdqu     ymm5, YMMWORD [rbx]     ; ymm5 = Cb samples (inptr1)
+    vmovdqu     ymm1, YMMWORD [rdx]     ; ymm1 = Cr samples (inptr2)
+
+    vpcmpeqw    ymm0, ymm0, ymm0        ; all ones
+    vpcmpeqw    ymm7, ymm7, ymm7        ; all ones
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; low-byte mask per word (0x00FF if BYTE_BIT=8)
+    vpsllw      ymm7, ymm7, 7           ; ymm7 = 0xFF80 per word = -128
+
+    vpand       ymm4, ymm0, ymm5        ; ymm4 = even-indexed Cb bytes (CbE)
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5 = odd-indexed Cb bytes (CbO)
+    vpand       ymm0, ymm0, ymm1        ; ymm0 = CrE
+    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1 = CrO
+
+    vpaddw      ymm2, ymm4, ymm7        ; ymm2 = CbE - 128
+    vpaddw      ymm3, ymm5, ymm7        ; ymm3 = CbO - 128
+    vpaddw      ymm6, ymm0, ymm7        ; ymm6 = CrE - 128
+    vpaddw      ymm7, ymm1, ymm7        ; ymm7 = CrO - 128
+
+    vpaddw      ymm4, ymm2, ymm2        ; ymm4 = 2*CbE
+    vpaddw      ymm5, ymm3, ymm3        ; ymm5 = 2*CbO
+    vpaddw      ymm0, ymm6, ymm6        ; ymm0 = 2*CrE
+    vpaddw      ymm1, ymm7, ymm7        ; ymm1 = 2*CrO
+
+    vpmulhw     ymm4, ymm4, [rel PW_MF0228]  ; high half of 2*CbE * PW_MF0228
+    vpmulhw     ymm5, ymm5, [rel PW_MF0228]  ; high half of 2*CbO * PW_MF0228
+    vpmulhw     ymm0, ymm0, [rel PW_F0402]   ; high half of 2*CrE * PW_F0402
+    vpmulhw     ymm1, ymm1, [rel PW_F0402]   ; high half of 2*CrO * PW_F0402
+
+    vpaddw      ymm4, ymm4, [rel PW_ONE]
+    vpaddw      ymm5, ymm5, [rel PW_ONE]
+    vpsraw      ymm4, ymm4, 1           ; (x + 1) >> 1: rounded halving
+    vpsraw      ymm5, ymm5, 1
+    vpaddw      ymm0, ymm0, [rel PW_ONE]
+    vpaddw      ymm1, ymm1, [rel PW_ONE]
+    vpsraw      ymm0, ymm0, 1
+    vpsraw      ymm1, ymm1, 1
+
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm5, ymm5, ymm3
+    vpaddw      ymm4, ymm4, ymm2        ; ymm4 = Cb-derived term, even (added to Y below)
+    vpaddw      ymm5, ymm5, ymm3        ; ymm5 = Cb-derived term, odd
+    vpaddw      ymm0, ymm0, ymm6        ; ymm0 = Cr-derived term, even
+    vpaddw      ymm1, ymm1, ymm7        ; ymm1 = Cr-derived term, odd
+
+    vmovdqa     YMMWORD [wk(0)], ymm4   ; spill: ymm4/ymm5 are reused below
+    vmovdqa     YMMWORD [wk(1)], ymm5
+
+    vpunpckhwd  ymm4, ymm2, ymm6        ; interleave (Cb, Cr) word pairs for vpmaddwd
+    vpunpcklwd  ymm2, ymm2, ymm6
+    vpmaddwd    ymm2, ymm2, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm4, ymm4, [rel PW_MF0344_F0285]
+    vpunpckhwd  ymm5, ymm3, ymm7
+    vpunpcklwd  ymm3, ymm3, ymm7
+    vpmaddwd    ymm3, ymm3, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm5, ymm5, [rel PW_MF0344_F0285]
+
+    vpaddd      ymm2, ymm2, [rel PD_ONEHALF]  ; round before the SCALEBITS shift
+    vpaddd      ymm4, ymm4, [rel PD_ONEHALF]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm4, ymm4, SCALEBITS
+    vpaddd      ymm3, ymm3, [rel PD_ONEHALF]
+    vpaddd      ymm5, ymm5, [rel PD_ONEHALF]
+    vpsrad      ymm3, ymm3, SCALEBITS
+    vpsrad      ymm5, ymm5, SCALEBITS
+
+    vpackssdw   ymm2, ymm2, ymm4        ; back to 16-bit lanes (even samples)
+    vpackssdw   ymm3, ymm3, ymm5        ; back to 16-bit lanes (odd samples)
+    vpsubw      ymm2, ymm2, ymm6        ; ymm2 = mixed Cb/Cr term minus (CrE-128)
+    vpsubw      ymm3, ymm3, ymm7        ; ymm3 = mixed Cb/Cr term minus (CrO-128)
+
+    vmovdqu     ymm5, YMMWORD [rsi]     ; ymm5 = Y samples (inptr0)
+
+    vpcmpeqw    ymm4, ymm4, ymm4
+    vpsrlw      ymm4, ymm4, BYTE_BIT    ; rebuild the low-byte mask
+    vpand       ymm4, ymm4, ymm5        ; ymm4 = even-indexed Y bytes (YE)
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5 = odd-indexed Y bytes (YO)
+
+    vpaddw      ymm0, ymm0, ymm4        ; ymm0 = YE + Cr-derived term
+    vpaddw      ymm1, ymm1, ymm5        ; ymm1 = YO + Cr-derived term
+    vpackuswb   ymm0, ymm0, ymm0        ; saturate to unsigned bytes
+    vpackuswb   ymm1, ymm1, ymm1
+
+    vpaddw      ymm2, ymm2, ymm4        ; ymm2 = YE + mixed Cb/Cr term
+    vpaddw      ymm3, ymm3, ymm5        ; ymm3 = YO + mixed Cb/Cr term
+    vpackuswb   ymm2, ymm2, ymm2
+    vpackuswb   ymm3, ymm3, ymm3
+
+    vpaddw      ymm4, ymm4, YMMWORD [wk(0)]  ; ymm4 = YE + Cb-derived term
+    vpaddw      ymm5, ymm5, YMMWORD [wk(1)]  ; ymm5 = YO + Cb-derived term
+    vpackuswb   ymm4, ymm4, ymm4
+    vpackuswb   ymm5, ymm5, ymm5
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+    ; ymmA..ymmH are register aliases defined outside this file (presumably in jcolsamp.inc).
+    vpunpcklbw  ymmA, ymmA, ymmC
+    vpunpcklbw  ymmE, ymmE, ymmB
+    vpunpcklbw  ymmD, ymmD, ymmF
+
+    vpsrldq     ymmH, ymmA, 2
+    vpunpckhwd  ymmG, ymmA, ymmE
+    vpunpcklwd  ymmA, ymmA, ymmE
+
+    vpsrldq     ymmE, ymmE, 2
+
+    vpsrldq     ymmB, ymmD, 2
+    vpunpckhwd  ymmC, ymmD, ymmH
+    vpunpcklwd  ymmD, ymmD, ymmH
+
+    vpunpckhwd  ymmF, ymmE, ymmB
+    vpunpcklwd  ymmE, ymmE, ymmB
+
+    vpshufd     ymmH, ymmA, 0x4E
+    vpunpckldq  ymmA, ymmA, ymmD
+    vpunpckhdq  ymmD, ymmD, ymmE
+    vpunpckldq  ymmE, ymmE, ymmH
+
+    vpshufd     ymmH, ymmG, 0x4E
+    vpunpckldq  ymmG, ymmG, ymmC
+    vpunpckhdq  ymmC, ymmC, ymmF
+    vpunpckldq  ymmF, ymmF, ymmH
+
+    vpunpcklqdq ymmH, ymmA, ymmE
+    vpunpcklqdq ymmG, ymmD, ymmG
+    vpunpcklqdq ymmC, ymmF, ymmC
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20
+    vperm2i128  ymmD, ymmC, ymmH, 0x30
+    vperm2i128  ymmF, ymmG, ymmC, 0x31
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64      ; fewer columns left than one full vector
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         rcx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD
+    ; Fall through: fewer than SIZEOF_YMMWORD bytes remain, all in ymmA.
+.column_st31:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1         ; bring the upper 128 bits down into xmmA
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    mov         BYTE [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; filler bytes = 0xFF
+    vpcmpeqb    ymm7, ymm7, ymm7
+%else
+    vpxor       ymm6, ymm6, ymm6        ; filler bytes = 0x00
+    vpxor       ymm7, ymm7, ymm7
+%endif
+    ; ymmA..ymmH are register aliases defined outside this file (presumably in jcolsamp.inc).
+    vpunpcklbw  ymmA, ymmA, ymmC
+    vpunpcklbw  ymmE, ymmE, ymmG
+    vpunpcklbw  ymmB, ymmB, ymmD
+    vpunpcklbw  ymmF, ymmF, ymmH
+
+    vpunpckhwd  ymmC, ymmA, ymmE
+    vpunpcklwd  ymmA, ymmA, ymmE
+    vpunpckhwd  ymmG, ymmB, ymmF
+    vpunpcklwd  ymmB, ymmB, ymmF
+
+    vpunpckhdq  ymmE, ymmA, ymmB
+    vpunpckldq  ymmB, ymmA, ymmB
+    vpunpckhdq  ymmF, ymmC, ymmG
+    vpunpckldq  ymmG, ymmC, ymmG
+
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20
+    vperm2i128  ymmD, ymmG, ymmF, 0x20
+    vperm2i128  ymmC, ymmB, ymmE, 0x31
+    vperm2i128  ymmH, ymmG, ymmF, 0x31
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64      ; fewer columns left than one full vector
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    cmp         rcx, byte SIZEOF_YMMWORD/2  ; rcx counts pixels here, not bytes
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         rcx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    vperm2i128  ymmA, ymmA, ymmA, 1         ; bring the upper 128 bits down into xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_YMMWORD/16*4
+    sub         rcx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    vmovd       XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+    pop         rcx                     ; restore per-row state saved at .rowloop
+    pop         rsi
+    pop         rbx
+    pop         rdx
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    vzeroupper                          ; avoid AVX->SSE transition stalls in the caller
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- unaligned rsp saved in the prologue
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/jdcolor-avx2-64.asm b/simd/jdcolor-avx2-64.asm
new file mode 100644 (file)
index 0000000..b9ca499
--- /dev/null
@@ -0,0 +1,120 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+; Conversion coefficients in fixed point, scaled by 2^SCALEBITS (65536).
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    global      EXTN(jconst_ycc_rgb_convert_avx2)
+; Exported start of the table; every row below is 32 bytes (one YMMWORD).
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402        times 16 dw  F_0_402            ; 16 words of  F_0_402
+PW_MF0228       times 16 dw -F_0_228            ; 16 words of -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285   ; 8 (-F_0_344, F_0_285) word pairs
+PW_ONE          times 16 dw  1                  ; 16 words of 1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS-1) ; 8 dwords of 2^(SCALEBITS-1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+; First copy: uses whatever RGB_* values are already in effect at this point.
+%include "jdcolext-avx2-64.asm"
+; Second copy: EXT_RGB layout, emitted as jsimd_ycc_extrgb_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2-64.asm"
+; EXT_RGBX layout, emitted as jsimd_ycc_extrgbx_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2-64.asm"
+; EXT_BGR layout, emitted as jsimd_ycc_extbgr_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2-64.asm"
+; EXT_BGRX layout, emitted as jsimd_ycc_extbgrx_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2-64.asm"
+; EXT_XBGR layout, emitted as jsimd_ycc_extxbgr_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2-64.asm"
+; EXT_XRGB layout, emitted as jsimd_ycc_extxrgb_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2-64.asm"
index 9869aec2ee345547c9490c1c62ad45c2b0c685d8..ec64383349d3786717cec813c9fb7d0964c832e6 100644 (file)
@@ -329,6 +329,29 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
         (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
          JSAMPARRAY output_buf, int num_rows);
 
+extern const int jconst_ycc_rgb_convert_avx2[];
+EXTERN(void) jsimd_ycc_rgb_convert_avx2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_avx2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_avx2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_avx2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_avx2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_avx2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_avx2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+
 EXTERN(void) jsimd_ycc_rgb_convert_neon
         (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
          JSAMPARRAY output_buf, int num_rows);
index ddad42b9f25b2159c4a330176803105f49e43bf3..73723403374b320eec9aa20fa5a398b8c573d46c 100644 (file)
@@ -123,6 +123,9 @@ jsimd_can_ycc_rgb (void)
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
 
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+    return 1;
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
     return 1;
@@ -239,37 +242,48 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
                        JSAMPIMAGE input_buf, JDIMENSION input_row,
                        JSAMPARRAY output_buf, int num_rows)
 {
+  void (*avx2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
   void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
 
   switch(cinfo->out_color_space) {
     case JCS_EXT_RGB:
+      avx2fct=jsimd_ycc_extrgb_convert_avx2;
       sse2fct=jsimd_ycc_extrgb_convert_sse2;
       break;
     case JCS_EXT_RGBX:
     case JCS_EXT_RGBA:
+      avx2fct=jsimd_ycc_extrgbx_convert_avx2;
       sse2fct=jsimd_ycc_extrgbx_convert_sse2;
       break;
     case JCS_EXT_BGR:
+      avx2fct=jsimd_ycc_extbgr_convert_avx2;
       sse2fct=jsimd_ycc_extbgr_convert_sse2;
       break;
     case JCS_EXT_BGRX:
     case JCS_EXT_BGRA:
+      avx2fct=jsimd_ycc_extbgrx_convert_avx2;
       sse2fct=jsimd_ycc_extbgrx_convert_sse2;
       break;
     case JCS_EXT_XBGR:
     case JCS_EXT_ABGR:
+      avx2fct=jsimd_ycc_extxbgr_convert_avx2;
       sse2fct=jsimd_ycc_extxbgr_convert_sse2;
       break;
     case JCS_EXT_XRGB:
     case JCS_EXT_ARGB:
+      avx2fct=jsimd_ycc_extxrgb_convert_avx2;
       sse2fct=jsimd_ycc_extxrgb_convert_sse2;
       break;
     default:
+      avx2fct=jsimd_ycc_rgb_convert_avx2;
       sse2fct=jsimd_ycc_rgb_convert_sse2;
       break;
   }
 
-  sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else
+    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 }
 
 GLOBAL(void)