]> granicus.if.org Git - libjpeg-turbo/commitdiff
64-bit AVX2 impl. of RGB->Gray color conversion
authorDRC <information@libjpeg-turbo.org>
Sun, 29 May 2016 00:15:18 +0000 (19:15 -0500)
committerDRC <information@libjpeg-turbo.org>
Wed, 1 Jun 2016 03:56:11 +0000 (22:56 -0500)
simd/CMakeLists.txt
simd/Makefile.am
simd/jcgray-avx2-64.asm [new file with mode: 0644]
simd/jcgryext-avx2-64.asm [new file with mode: 0644]
simd/jsimd.h
simd/jsimd_x86_64.c

index 2422439cbff07880191f576116033a52e611cfeb..67683c57f53d56c5975be3e7939baa00c7d4da13 100755 (executable)
@@ -25,7 +25,7 @@ if(SIMD_X86_64)
     jchuff-sse2-64 jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64
     jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
     jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
-    jquanti-sse2-64 jccolor-avx2-64)
+    jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64)
   message(STATUS "Building x86_64 SIMD extensions")
 else()
   set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx
index 80af2955fbfcdd424cad2fd9f16658955f317236..6df26f18301f30ab55482cf2bac46600231747b5 100644 (file)
@@ -19,13 +19,14 @@ libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
        jdsample-sse2-64.asm  jfdctfst-sse2-64.asm  jfdctint-sse2-64.asm \
        jidctflt-sse2-64.asm  jidctfst-sse2-64.asm  jidctint-sse2-64.asm \
        jidctred-sse2-64.asm  jquantf-sse2-64.asm   jquanti-sse2-64.asm \
-       jccolor-avx2-64.asm
+       jccolor-avx2-64.asm   jcgray-avx2-64.asm
 
 jccolor-sse2-64.lo:  jccolext-sse2-64.asm
 jcgray-sse2-64.lo:   jcgryext-sse2-64.asm
 jdcolor-sse2-64.lo:  jdcolext-sse2-64.asm
 jdmerge-sse2-64.lo:  jdmrgext-sse2-64.asm
 jccolor-avx2-64.lo:  jccolext-avx2-64.asm
+jcgray-avx2-64.lo:   jcgryext-avx2-64.asm
 
 endif
 
diff --git a/simd/jcgray-avx2-64.asm b/simd/jcgray-avx2-64.asm
new file mode 100644 (file)
index 0000000..d09ec4e
--- /dev/null
@@ -0,0 +1,115 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    global      EXTN(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF     times 8 dd (1 << (SCALEBITS-1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jcgryext-avx2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2-64.asm"
diff --git a/simd/jcgryext-avx2-64.asm b/simd/jcgryext-avx2-64.asm
new file mode 100644 (file)
index 0000000..0342209
--- /dev/null
@@ -0,0 +1,386 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2 (JDIMENSION img_width,
+;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                              JDIMENSION output_row, int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+
+    global      EXTN(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr0
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, BYTE [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, WORD [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         rcx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         rcx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0
+    vinserti128 ymmC, ymmC, xmmB, 0
+    vinserti128 ymmB, ymmB, xmmF, 0
+    vperm2i128  ymmF, ymmC, ymmC, 1
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8
+    vpsrldq     ymmG, ymmG, 8
+
+    vpunpckhbw  ymmA, ymmA, ymmF
+    vpslldq     ymmF, ymmF, 8
+
+    vpunpcklbw  ymmG, ymmG, ymmB
+    vpunpckhbw  ymmF, ymmF, ymmB
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8
+    vpsrldq     ymmD, ymmD, 8
+
+    vpunpckhbw  ymmA, ymmA, ymmG
+    vpslldq     ymmG, ymmG, 8
+
+    vpunpcklbw  ymmD, ymmD, ymmF
+    vpunpckhbw  ymmG, ymmG, ymmF
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8
+    vpsrldq     ymmE, ymmE, 8
+
+    vpunpckhbw  ymmA, ymmA, ymmD
+    vpslldq     ymmD, ymmD, 8
+
+    vpunpcklbw  ymmE, ymmE, ymmG
+    vpunpckhbw  ymmD, ymmD, ymmG
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH
+    vpunpckhbw  ymmC, ymmC, ymmH
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH
+    vpunpckhbw  ymmB, ymmB, ymmH
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH
+    vpunpckhbw  ymmF, ymmF, ymmH
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmE, xmmA
+    vperm2i128  ymmE, ymmE, ymmE, 1
+    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmE
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmE, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmF, ymmA
+    vmovdqa     ymmH, ymmE
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmF, 1
+    vperm2i128  ymmF, ymmB, ymmF, 0x31
+
+    vmovdqa     ymmB, ymmE
+    vinserti128 ymmE, ymmE, xmmH, 1
+    vperm2i128  ymmH, ymmB, ymmH, 0x31
+
+    vmovdqa     ymmB, ymmE
+    vmovdqa     ymmE, ymmF
+    vmovdqa     ymmF, ymmB
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE
+    vpunpckhbw  ymmD, ymmD, ymmE
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmC, ymmC, ymmH
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF
+    vpunpckhwd  ymmB, ymmB, ymmF
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC
+    vpunpckhwd  ymmG, ymmG, ymmC
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD
+    vpunpckhbw  ymmE, ymmE, ymmD
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG
+    vpunpckhbw  ymmH, ymmH, ymmG
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF
+    vpunpckhbw  ymmC, ymmC, ymmF
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF
+    vpunpckhbw  ymmD, ymmD, ymmF
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF
+    vpunpckhbw  ymmG, ymmG, ymmF
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT
+    vpsrlw      ymmH, ymmH, BYTE_BIT
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     ymm0, ymm5              ; ymm0=BO
+    vmovdqa     ymm6, ymm4              ; ymm6=BE
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, ymm1
+    vpaddd      ymm4, ymm4, ymm7
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
+
+    sub         rcx, byte SIZEOF_YMMWORD
+    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
index c44e109193831063ce53a781159106d627e4300a..f7e6d76d02391898b8fa2f1ac5f0f104803b721d 100644 (file)
@@ -216,6 +216,29 @@ EXTERN(void) jsimd_extxrgb_gray_convert_sse2
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
          JDIMENSION output_row, int num_rows);
 
+extern const int jconst_rgb_gray_convert_avx2[];
+EXTERN(void) jsimd_rgb_gray_convert_avx2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_avx2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_avx2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_avx2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_avx2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_avx2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_avx2
+        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+         JDIMENSION output_row, int num_rows);
+
 EXTERN(void) jsimd_rgb_gray_convert_mips_dspr2
         (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
          JDIMENSION output_row, int num_rows);
index 16213163baacfd743befee20baea45f42c82ac1f..740a962a129e7979fd849f2f39d533d71d4737f3 100644 (file)
@@ -100,6 +100,9 @@ jsimd_can_rgb_gray (void)
   if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
     return 0;
 
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+    return 1;
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
     return 1;
@@ -187,37 +190,48 @@ jsimd_rgb_gray_convert (j_compress_ptr cinfo,
                         JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                         JDIMENSION output_row, int num_rows)
 {
+  void (*avx2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
   switch(cinfo->in_color_space) {
     case JCS_EXT_RGB:
+      avx2fct=jsimd_extrgb_gray_convert_avx2;
       sse2fct=jsimd_extrgb_gray_convert_sse2;
       break;
     case JCS_EXT_RGBX:
     case JCS_EXT_RGBA:
+      avx2fct=jsimd_extrgbx_gray_convert_avx2;
       sse2fct=jsimd_extrgbx_gray_convert_sse2;
       break;
     case JCS_EXT_BGR:
+      avx2fct=jsimd_extbgr_gray_convert_avx2;
       sse2fct=jsimd_extbgr_gray_convert_sse2;
       break;
     case JCS_EXT_BGRX:
     case JCS_EXT_BGRA:
+      avx2fct=jsimd_extbgrx_gray_convert_avx2;
       sse2fct=jsimd_extbgrx_gray_convert_sse2;
       break;
     case JCS_EXT_XBGR:
     case JCS_EXT_ABGR:
+      avx2fct=jsimd_extxbgr_gray_convert_avx2;
       sse2fct=jsimd_extxbgr_gray_convert_sse2;
       break;
     case JCS_EXT_XRGB:
     case JCS_EXT_ARGB:
+      avx2fct=jsimd_extxrgb_gray_convert_avx2;
       sse2fct=jsimd_extxrgb_gray_convert_sse2;
       break;
     default:
+      avx2fct=jsimd_rgb_gray_convert_avx2;
       sse2fct=jsimd_rgb_gray_convert_sse2;
       break;
   }
 
-  sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)