; r13 = JDIMENSION output_row
; r14 = int num_rows
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 8
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 8
- align 16
+ align 16
- global EXTN(jsimd_rgb_ycc_convert_sse2)
+ global EXTN(jsimd_rgb_ycc_convert_sse2)
EXTN(jsimd_rgb_ycc_convert_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rsi, r12
- mov ecx, r13d
- mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
- lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
- lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
- lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
- pop rcx
-
- mov rsi, r11
- mov eax, r14d
- test rax,rax
- jle near .return
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
+ push rbx
+
+ mov ecx, r10d
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d
+ mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+ mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rsi, r11
+ mov eax, r14d
+ test rax, rax
+ jle near .return
.rowloop:
- push rdx
- push rbx
- push rdi
- push rsi
- push rcx ; col
+ push rdx
+ push rbx
+ push rdi
+ push rsi
+ push rcx ; col
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr0
- mov rbx, JSAMPROW [rbx] ; outptr1
- mov rdx, JSAMPROW [rdx] ; outptr2
+ mov rsi, JSAMPROW [rsi] ; inptr
+ mov rdi, JSAMPROW [rdi] ; outptr0
+ mov rbx, JSAMPROW [rbx] ; outptr1
+ mov rdx, JSAMPROW [rdx] ; outptr2
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
-%if RGB_PIXELSIZE == 3 ; ---------------
+%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
- push rax
- push rdx
- lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub rcx, byte SIZEOF_BYTE
- movzx rax, BYTE [rsi+rcx]
+ push rax
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, BYTE [rsi+rcx]
.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub rcx, byte SIZEOF_WORD
- movzx rdx, WORD [rsi+rcx]
- shl rax, WORD_BIT
- or rax,rdx
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, WORD [rsi+rcx]
+ shl rax, WORD_BIT
+ or rax, rdx
.column_ld4:
- movd xmmA,eax
- pop rdx
- pop rax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub rcx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
+ movd xmmA, eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub rcx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- mov rcx, SIZEOF_XMMWORD
- jmp short .rgb_ycc_cnv
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .rgb_ycc_cnv
.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
+ test cl, 2*SIZEOF_XMMWORD
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
.rgb_ycc_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
- pxor xmmH,xmmH
+ pxor xmmH, xmmH
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-%else ; RGB_PIXELSIZE == 4 ; -----------
+%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub rcx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub rcx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub rcx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
+ test cl, SIZEOF_XMMWORD/2
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
.rgb_ycc_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
- movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- movdqa xmm7,xmm1
- movdqa xmm4,xmm6
- pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
- pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
- pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
- movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
- movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- pxor xmm1,xmm1
- pxor xmm6,xmm6
- punpcklwd xmm1,xmm5 ; xmm1=BOL
- punpckhwd xmm6,xmm5 ; xmm6=BOH
- psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
-
- movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm1
- paddd xmm4,xmm6
- paddd xmm7,xmm5
- paddd xmm4,xmm5
- psrld xmm7,SCALEBITS ; xmm7=CbOL
- psrld xmm4,SCALEBITS ; xmm4=CbOH
- packssdw xmm7,xmm4 ; xmm7=CbO
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- movdqa xmm5,xmm0
- movdqa xmm4,xmm6
- pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
- pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
- pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- pxor xmm0,xmm0
- pxor xmm6,xmm6
- punpcklwd xmm0,xmm1 ; xmm0=BEL
- punpckhwd xmm6,xmm1 ; xmm6=BEH
- psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
-
- movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm5,xmm0
- paddd xmm4,xmm6
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrld xmm5,SCALEBITS ; xmm5=CbEL
- psrld xmm4,SCALEBITS ; xmm4=CbEH
- packssdw xmm5,xmm4 ; xmm5=CbE
-
- psllw xmm7,BYTE_BIT
- por xmm5,xmm7 ; xmm5=Cb
- movdqa XMMWORD [rbx], xmm5 ; Save Cb
-
- movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- movdqa xmm7,xmm0
- movdqa xmm5,xmm4
- pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
- pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
- pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
- movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, XMMWORD [wk(4)]
- paddd xmm4, XMMWORD [wk(5)]
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- pxor xmm3,xmm3
- pxor xmm4,xmm4
- punpcklwd xmm3,xmm1 ; xmm3=ROL
- punpckhwd xmm4,xmm1 ; xmm4=ROH
- psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
- psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
-
- movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm3
- paddd xmm5,xmm4
- paddd xmm7,xmm1
- paddd xmm5,xmm1
- psrld xmm7,SCALEBITS ; xmm7=CrOL
- psrld xmm5,SCALEBITS ; xmm5=CrOH
- packssdw xmm7,xmm5 ; xmm7=CrO
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
- pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
- pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
- movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(6)]
- paddd xmm4, XMMWORD [wk(7)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [rdi], xmm6 ; Save Y
-
- pxor xmm2,xmm2
- pxor xmm4,xmm4
- punpcklwd xmm2,xmm3 ; xmm2=REL
- punpckhwd xmm4,xmm3 ; xmm4=REH
- psrld xmm2,1 ; xmm2=REL*FIX(0.500)
- psrld xmm4,1 ; xmm4=REH*FIX(0.500)
-
- movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
-
- paddd xmm1,xmm2
- paddd xmm5,xmm4
- paddd xmm1,xmm0
- paddd xmm5,xmm0
- psrld xmm1,SCALEBITS ; xmm1=CrEL
- psrld xmm5,SCALEBITS ; xmm5=CrEH
- packssdw xmm1,xmm5 ; xmm1=CrE
-
- psllw xmm7,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Cr
- movdqa XMMWORD [rdx], xmm1 ; Save Cr
-
- sub rcx, byte SIZEOF_XMMWORD
- add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add rdi, byte SIZEOF_XMMWORD ; outptr0
- add rbx, byte SIZEOF_XMMWORD ; outptr1
- add rdx, byte SIZEOF_XMMWORD ; outptr2
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test rcx,rcx
- jnz near .column_ld1
-
- pop rcx ; col
- pop rsi
- pop rdi
- pop rbx
- pop rdx
-
- add rsi, byte SIZEOF_JSAMPROW ; input_buf
- add rdi, byte SIZEOF_JSAMPROW
- add rbx, byte SIZEOF_JSAMPROW
- add rdx, byte SIZEOF_JSAMPROW
- dec rax ; num_rows
- jg near .rowloop
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm6
+ pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1, xmm1
+ pxor xmm6, xmm6
+ punpcklwd xmm1, xmm5 ; xmm1=BOL
+ punpckhwd xmm6, xmm5 ; xmm6=BOH
+ psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm1
+ paddd xmm4, xmm6
+ paddd xmm7, xmm5
+ paddd xmm4, xmm5
+ psrld xmm7, SCALEBITS ; xmm7=CbOL
+ psrld xmm4, SCALEBITS ; xmm4=CbOH
+ packssdw xmm7, xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm6
+ pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0, xmm0
+ pxor xmm6, xmm6
+ punpcklwd xmm0, xmm1 ; xmm0=BEL
+ punpckhwd xmm6, xmm1 ; xmm6=BEH
+ psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5, xmm0
+ paddd xmm4, xmm6
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrld xmm5, SCALEBITS ; xmm5=CbEL
+ psrld xmm4, SCALEBITS ; xmm4=CbEH
+ packssdw xmm5, xmm4 ; xmm5=CbE
+
+ psllw xmm7, BYTE_BIT
+ por xmm5, xmm7 ; xmm5=Cb
+ movdqa XMMWORD [rbx], xmm5 ; Save Cb
+
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ movdqa xmm7, xmm0
+ movdqa xmm5, xmm4
+ pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)]
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ punpcklwd xmm3, xmm1 ; xmm3=ROL
+ punpckhwd xmm4, xmm1 ; xmm4=ROH
+ psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm3
+ paddd xmm5, xmm4
+ paddd xmm7, xmm1
+ paddd xmm5, xmm1
+ psrld xmm7, SCALEBITS ; xmm7=CrOL
+ psrld xmm5, SCALEBITS ; xmm5=CrOH
+ packssdw xmm7, xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)]
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [rdi], xmm6 ; Save Y
+
+ pxor xmm2, xmm2
+ pxor xmm4, xmm4
+ punpcklwd xmm2, xmm3 ; xmm2=REL
+ punpckhwd xmm4, xmm3 ; xmm4=REH
+ psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1, xmm2
+ paddd xmm5, xmm4
+ paddd xmm1, xmm0
+ paddd xmm5, xmm0
+ psrld xmm1, SCALEBITS ; xmm1=CrEL
+ psrld xmm5, SCALEBITS ; xmm5=CrEH
+ packssdw xmm1, xmm5 ; xmm1=CrE
+
+ psllw xmm7, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Cr
+ movdqa XMMWORD [rdx], xmm1 ; Save Cr
+
+ sub rcx, byte SIZEOF_XMMWORD
+ add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add rdi, byte SIZEOF_XMMWORD ; outptr0
+ add rbx, byte SIZEOF_XMMWORD ; outptr1
+ add rdx, byte SIZEOF_XMMWORD ; outptr2
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+ pop rbx
+ pop rdx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ pop rbx
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; JDIMENSION output_row, int num_rows);
;
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
+%define img_width(b) (b)+8 ; JDIMENSION img_width
+%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b)+20 ; JDIMENSION output_row
+%define num_rows(b) (b)+24 ; int num_rows
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 8
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 8
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
+ align 16
- global EXTN(jsimd_rgb_ycc_convert_sse2)
+ global EXTN(jsimd_rgb_ycc_convert_sse2)
EXTN(jsimd_rgb_ycc_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)]
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
.rowloop:
- pushpic eax
- push edx
- push ebx
- push edi
- push esi
- push ecx ; col
+ pushpic eax
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- mov ebx, JSAMPROW [ebx] ; outptr1
- mov edx, JSAMPROW [edx] ; outptr2
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- alignx 16,7
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ alignx 16, 7
-%if RGB_PIXELSIZE == 3 ; ---------------
+%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, BYTE [esi+ecx]
.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, WORD [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
.column_ld4:
- movd xmmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [esi+ecx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
+ movd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub ecx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [esi+ecx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- mov ecx, SIZEOF_XMMWORD
- jmp short .rgb_ycc_cnv
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .rgb_ycc_cnv
.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
+ test cl, 2*SIZEOF_XMMWORD
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
.rgb_ycc_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
- pxor xmmH,xmmH
+ pxor xmmH, xmmH
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-%else ; RGB_PIXELSIZE == 4 ; -----------
+%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub ecx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub ecx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub ecx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
+ test cl, SIZEOF_XMMWORD/2
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
.rgb_ycc_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
- movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- movdqa xmm7,xmm1
- movdqa xmm4,xmm6
- pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
- pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
- pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
- movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
- movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- pxor xmm1,xmm1
- pxor xmm6,xmm6
- punpcklwd xmm1,xmm5 ; xmm1=BOL
- punpckhwd xmm6,xmm5 ; xmm6=BOH
- psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
-
- movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm1
- paddd xmm4,xmm6
- paddd xmm7,xmm5
- paddd xmm4,xmm5
- psrld xmm7,SCALEBITS ; xmm7=CbOL
- psrld xmm4,SCALEBITS ; xmm4=CbOH
- packssdw xmm7,xmm4 ; xmm7=CbO
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- movdqa xmm5,xmm0
- movdqa xmm4,xmm6
- pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
- pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- pxor xmm0,xmm0
- pxor xmm6,xmm6
- punpcklwd xmm0,xmm1 ; xmm0=BEL
- punpckhwd xmm6,xmm1 ; xmm6=BEH
- psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
-
- movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm5,xmm0
- paddd xmm4,xmm6
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrld xmm5,SCALEBITS ; xmm5=CbEL
- psrld xmm4,SCALEBITS ; xmm4=CbEH
- packssdw xmm5,xmm4 ; xmm5=CbE
-
- psllw xmm7,BYTE_BIT
- por xmm5,xmm7 ; xmm5=Cb
- movdqa XMMWORD [ebx], xmm5 ; Save Cb
-
- movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- movdqa xmm7,xmm0
- movdqa xmm5,xmm4
- pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
- pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
- movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, XMMWORD [wk(4)]
- paddd xmm4, XMMWORD [wk(5)]
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- pxor xmm3,xmm3
- pxor xmm4,xmm4
- punpcklwd xmm3,xmm1 ; xmm3=ROL
- punpckhwd xmm4,xmm1 ; xmm4=ROH
- psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
- psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
-
- movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm3
- paddd xmm5,xmm4
- paddd xmm7,xmm1
- paddd xmm5,xmm1
- psrld xmm7,SCALEBITS ; xmm7=CrOL
- psrld xmm5,SCALEBITS ; xmm5=CrOH
- packssdw xmm7,xmm5 ; xmm7=CrO
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
- pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
- movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(6)]
- paddd xmm4, XMMWORD [wk(7)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [edi], xmm6 ; Save Y
-
- pxor xmm2,xmm2
- pxor xmm4,xmm4
- punpcklwd xmm2,xmm3 ; xmm2=REL
- punpckhwd xmm4,xmm3 ; xmm4=REH
- psrld xmm2,1 ; xmm2=REL*FIX(0.500)
- psrld xmm4,1 ; xmm4=REH*FIX(0.500)
-
- movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
-
- paddd xmm1,xmm2
- paddd xmm5,xmm4
- paddd xmm1,xmm0
- paddd xmm5,xmm0
- psrld xmm1,SCALEBITS ; xmm1=CrEL
- psrld xmm5,SCALEBITS ; xmm5=CrEH
- packssdw xmm1,xmm5 ; xmm1=CrE
-
- psllw xmm7,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Cr
- movdqa XMMWORD [edx], xmm1 ; Save Cr
-
- sub ecx, byte SIZEOF_XMMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add edi, byte SIZEOF_XMMWORD ; outptr0
- add ebx, byte SIZEOF_XMMWORD ; outptr1
- add edx, byte SIZEOF_XMMWORD ; outptr2
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- pop ebx
- pop edx
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm6
+ pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1, xmm1
+ pxor xmm6, xmm6
+ punpcklwd xmm1, xmm5 ; xmm1=BOL
+ punpckhwd xmm6, xmm5 ; xmm6=BOH
+ psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm1
+ paddd xmm4, xmm6
+ paddd xmm7, xmm5
+ paddd xmm4, xmm5
+ psrld xmm7, SCALEBITS ; xmm7=CbOL
+ psrld xmm4, SCALEBITS ; xmm4=CbOH
+ packssdw xmm7, xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm6
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0, xmm0
+ pxor xmm6, xmm6
+ punpcklwd xmm0, xmm1 ; xmm0=BEL
+ punpckhwd xmm6, xmm1 ; xmm6=BEH
+ psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5, xmm0
+ paddd xmm4, xmm6
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrld xmm5, SCALEBITS ; xmm5=CbEL
+ psrld xmm4, SCALEBITS ; xmm4=CbEH
+ packssdw xmm5, xmm4 ; xmm5=CbE
+
+ psllw xmm7, BYTE_BIT
+ por xmm5, xmm7 ; xmm5=Cb
+ movdqa XMMWORD [ebx], xmm5 ; Save Cb
+
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ movdqa xmm7, xmm0
+ movdqa xmm5, xmm4
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)]
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ punpcklwd xmm3, xmm1 ; xmm3=ROL
+ punpckhwd xmm4, xmm1 ; xmm4=ROH
+ psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm3
+ paddd xmm5, xmm4
+ paddd xmm7, xmm1
+ paddd xmm5, xmm1
+ psrld xmm7, SCALEBITS ; xmm7=CrOL
+ psrld xmm5, SCALEBITS ; xmm5=CrOH
+ packssdw xmm7, xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)]
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [edi], xmm6 ; Save Y
+
+ pxor xmm2, xmm2
+ pxor xmm4, xmm4
+ punpcklwd xmm2, xmm3 ; xmm2=REL
+ punpckhwd xmm4, xmm3 ; xmm4=REH
+ psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1, xmm2
+ paddd xmm5, xmm4
+ paddd xmm1, xmm0
+ paddd xmm5, xmm0
+ psrld xmm1, SCALEBITS ; xmm1=CrEL
+ psrld xmm5, SCALEBITS ; xmm5=CrEH
+ packssdw xmm1, xmm5 ; xmm1=CrE
+
+ psllw xmm7, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Cr
+ movdqa XMMWORD [edx], xmm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_XMMWORD
+ add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add edi, byte SIZEOF_XMMWORD ; outptr0
+ add ebx, byte SIZEOF_XMMWORD ; outptr1
+ add edx, byte SIZEOF_XMMWORD ; outptr2
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define SCALEBITS 16
-
-F_0_081 equ 5329 ; FIX(0.08131)
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_168 equ 11059 ; FIX(0.16874)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_331 equ 21709 ; FIX(0.33126)
-F_0_418 equ 27439 ; FIX(0.41869)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_rgb_ycc_convert_sse2)
+ alignz 16
+ global EXTN(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2):
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
%include "jccolext-sse2-64.asm"
; --------------------------------------------------------------------------
-%define SCALEBITS 16
-
-F_0_081 equ 5329 ; FIX(0.08131)
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_168 equ 11059 ; FIX(0.16874)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_331 equ 21709 ; FIX(0.33126)
-F_0_418 equ 27439 ; FIX(0.41869)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_rgb_ycc_convert_sse2)
+ alignz 16
+ global EXTN(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2):
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
%include "jccolext-sse2.asm"
; --------------------------------------------------------------------------
-%define SCALEBITS 16
+%define SCALEBITS 16
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_rgb_gray_convert_sse2)
+ alignz 16
+ global EXTN(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2):
-PW_F0299_F0337 times 4 dw F_0_299, F_0_337
-PW_F0114_F0250 times 4 dw F_0_114, F_0_250
-PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
%include "jcgryext-sse2-64.asm"
; --------------------------------------------------------------------------
-%define SCALEBITS 16
+%define SCALEBITS 16
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_rgb_gray_convert_sse2)
+ alignz 16
+ global EXTN(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2):
-PW_F0299_F0337 times 4 dw F_0_299, F_0_337
-PW_F0114_F0250 times 4 dw F_0_114, F_0_250
-PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
%include "jcgryext-sse2.asm"
; r13 = JDIMENSION output_row
; r14 = int num_rows
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
- align 16
+ align 16
- global EXTN(jsimd_rgb_gray_convert_sse2)
+ global EXTN(jsimd_rgb_gray_convert_sse2)
EXTN(jsimd_rgb_gray_convert_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rsi, r12
- mov ecx, r13d
- mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
- lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
-
- pop rcx
-
- mov rsi, r11
- mov eax, r14d
- test rax,rax
- jle near .return
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
+ push rbx
+
+ mov ecx, r10d
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d
+ mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rsi, r11
+ mov eax, r14d
+ test rax, rax
+ jle near .return
.rowloop:
- push rdi
- push rsi
- push rcx ; col
+ push rdi
+ push rsi
+ push rcx ; col
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr0
+ mov rsi, JSAMPROW [rsi] ; inptr
+ mov rdi, JSAMPROW [rdi] ; outptr0
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
-%if RGB_PIXELSIZE == 3 ; ---------------
+%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
- push rax
- push rdx
- lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub rcx, byte SIZEOF_BYTE
- movzx rax, BYTE [rsi+rcx]
+ push rax
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, BYTE [rsi+rcx]
.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub rcx, byte SIZEOF_WORD
- movzx rdx, WORD [rsi+rcx]
- shl rax, WORD_BIT
- or rax,rdx
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, WORD [rsi+rcx]
+ shl rax, WORD_BIT
+ or rax, rdx
.column_ld4:
- movd xmmA,eax
- pop rdx
- pop rax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub rcx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
+ movd xmmA, eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub rcx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- mov rcx, SIZEOF_XMMWORD
- jmp short .rgb_gray_cnv
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .rgb_gray_cnv
.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
+ test cl, 2*SIZEOF_XMMWORD
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
.rgb_gray_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
- pxor xmmH,xmmH
+ pxor xmmH, xmmH
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-%else ; RGB_PIXELSIZE == 4 ; -----------
+%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub rcx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub rcx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub rcx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
+ test cl, SIZEOF_XMMWORD/2
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
.rgb_gray_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa xmm0, xmm5 ; xmm0=BO
- movdqa xmm6, xmm4 ; xmm6=BE
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
- movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, xmm1
- paddd xmm4, xmm7
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
- movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(0)]
- paddd xmm4, XMMWORD [wk(1)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [rdi], xmm6 ; Save Y
-
- sub rcx, byte SIZEOF_XMMWORD
- add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add rdi, byte SIZEOF_XMMWORD ; outptr0
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test rcx,rcx
- jnz near .column_ld1
-
- pop rcx ; col
- pop rsi
- pop rdi
-
- add rsi, byte SIZEOF_JSAMPROW ; input_buf
- add rdi, byte SIZEOF_JSAMPROW
- dec rax ; num_rows
- jg near .rowloop
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa xmm0, xmm5 ; xmm0=BO
+ movdqa xmm6, xmm4 ; xmm6=BE
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, xmm1
+ paddd xmm4, xmm7
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(0)]
+ paddd xmm4, XMMWORD [wk(1)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [rdi], xmm6 ; Save Y
+
+ sub rcx, byte SIZEOF_XMMWORD
+ add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add rdi, byte SIZEOF_XMMWORD ; outptr0
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ pop rbx
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; JDIMENSION output_row, int num_rows);
;
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
+%define img_width(b) (b)+8 ; JDIMENSION img_width
+%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b)+20 ; JDIMENSION output_row
+%define num_rows(b) (b)+24 ; int num_rows
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
+ align 16
- global EXTN(jsimd_rgb_gray_convert_sse2)
+ global EXTN(jsimd_rgb_gray_convert_sse2)
EXTN(jsimd_rgb_gray_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)]
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
.rowloop:
- pushpic eax
- push edi
- push esi
- push ecx ; col
+ pushpic eax
+ push edi
+ push esi
+ push ecx ; col
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- alignx 16,7
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ alignx 16, 7
-%if RGB_PIXELSIZE == 3 ; ---------------
+%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, BYTE [esi+ecx]
.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, WORD [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
.column_ld4:
- movd xmmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [esi+ecx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
+ movd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub ecx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [esi+ecx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- mov ecx, SIZEOF_XMMWORD
- jmp short .rgb_gray_cnv
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .rgb_gray_cnv
.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
- alignx 16,7
+ test cl, 2*SIZEOF_XMMWORD
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
.rgb_gray_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
- pxor xmmH,xmmH
+ pxor xmmH, xmmH
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-%else ; RGB_PIXELSIZE == 4 ; -----------
+%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub ecx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub ecx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub ecx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
- alignx 16,7
+ test cl, SIZEOF_XMMWORD/2
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
.rgb_gray_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa xmm0, xmm5 ; xmm0=BO
- movdqa xmm6, xmm4 ; xmm6=BE
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
- movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, xmm1
- paddd xmm4, xmm7
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
- movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(0)]
- paddd xmm4, XMMWORD [wk(1)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [edi], xmm6 ; Save Y
-
- sub ecx, byte SIZEOF_XMMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add edi, byte SIZEOF_XMMWORD ; outptr0
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa xmm0, xmm5 ; xmm0=BO
+ movdqa xmm6, xmm4 ; xmm6=BE
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, xmm1
+ paddd xmm4, xmm7
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(0)]
+ paddd xmm4, XMMWORD [wk(1)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [rdi], xmm6 ; Save Y
+
+ sub rcx, byte SIZEOF_XMMWORD
+ add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add rdi, byte SIZEOF_XMMWORD ; outptr0
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+ poppic eax
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ pop rdi
+ pop rsi
+; pop rdx ; need not be preserved
+; pop rcx ; need not be preserved
+ pop rbx
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
%include "jsimdext.inc"
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_huff_encode_one_block)
+ alignz 16
+ global EXTN(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block):
%include "jpeg_nbits_table.inc"
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
; These macros perform the same task as the emit_bits() function in the
; original libjpeg code. In addition to reducing overhead by explicitly
; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
%macro EMIT_BYTE 0
- sub put_bits, 8 ; put_bits -= 8;
- mov rdx, put_buffer
- mov ecx, put_bits
- shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
- mov byte [buffer], dl ; *buffer++ = c;
- add buffer, 1
- cmp dl, 0xFF ; need to stuff a zero byte?
- jne %%.EMIT_BYTE_END
- mov byte [buffer], 0 ; *buffer++ = 0;
- add buffer, 1
+ sub put_bits, 8 ; put_bits -= 8;
+ mov rdx, put_buffer
+ mov ecx, put_bits
+ shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
+ mov byte [buffer], dl ; *buffer++ = c;
+ add buffer, 1
+ cmp dl, 0xFF ; need to stuff a zero byte?
+ jne %%.EMIT_BYTE_END
+ mov byte [buffer], 0 ; *buffer++ = 0;
+ add buffer, 1
%%.EMIT_BYTE_END:
%endmacro
%macro PUT_BITS 1
- add put_bits, ecx ; put_bits += size;
- shl put_buffer, cl ; put_buffer = (put_buffer << size);
- or put_buffer, %1
+ add put_bits, ecx ; put_bits += size;
+ shl put_buffer, cl ; put_buffer = (put_buffer << size);
+ or put_buffer, %1
%endmacro
%macro CHECKBUF31 0
- cmp put_bits, 32 ; if (put_bits > 31) {
- jl %%.CHECKBUF31_END
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
+ cmp put_bits, 32 ; if (put_bits > 31) {
+ jl %%.CHECKBUF31_END
+ EMIT_BYTE
+ EMIT_BYTE
+ EMIT_BYTE
+ EMIT_BYTE
%%.CHECKBUF31_END:
%endmacro
%macro CHECKBUF47 0
- cmp put_bits, 48 ; if (put_bits > 47) {
- jl %%.CHECKBUF47_END
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
+ cmp put_bits, 48 ; if (put_bits > 47) {
+ jl %%.CHECKBUF47_END
+ EMIT_BYTE
+ EMIT_BYTE
+ EMIT_BYTE
+ EMIT_BYTE
+ EMIT_BYTE
+ EMIT_BYTE
%%.CHECKBUF47_END:
%endmacro
%macro EMIT_BITS 2
- CHECKBUF47
- mov ecx, %2
- PUT_BITS %1
+ CHECKBUF47
+ mov ecx, %2
+ PUT_BITS %1
%endmacro
-%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
- pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
- pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
- pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
- pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
- pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
- pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
- pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
- pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
- pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
- pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
- pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
- pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
- pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
- pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
- pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
- pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
- pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
- pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
- pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
- pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
- pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
- pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
- pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
- pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
- pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
- pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
- pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
- pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
- pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
- pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
- pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
- pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
- pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
- pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
- pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
+%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
+ pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
+ pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
+ pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
+ pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
+ pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
+ pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
+ pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
+ pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
+ pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
+ pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
+ pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
+ pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
+ pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
+ pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
+ pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
+ pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
+ pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
+ pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
+ pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
+ pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
+ pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
+ pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
+ pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
+ pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
+ pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
+ pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
+ pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
+ pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
+ pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
+ pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
+ pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
+ pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
+ pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
+ pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
+ pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
%if %1 != 32
- pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
+ pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
%else
- pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
+ pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
%endif
- pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
- paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
- paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
- paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
- paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
- pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
- pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
- pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
- pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
- pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
- movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
- movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
- movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
- movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
- movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
- movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
- movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
- movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+ pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
+ pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
+ pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
+ pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
+ paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
+ paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
+ paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
+ paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
+ pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
+ pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
+ pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
+ pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
+ pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
+ pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
+ pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
+ pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
+ movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
+ movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
+ movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
+ movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
+ movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
+ movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
+ movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
+ movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
%endmacro
;
; r14 = c_derived_tbl *dctbl
; r15 = c_derived_tbl *actbl
-%define t1 rbp-(DCTSIZE2*SIZEOF_WORD)
-%define t2 t1-(DCTSIZE2*SIZEOF_WORD)
-%define put_buffer r8
-%define put_bits r9d
-%define buffer rax
+%define t1 rbp-(DCTSIZE2*SIZEOF_WORD)
+%define t2 t1-(DCTSIZE2*SIZEOF_WORD)
+%define put_buffer r8
+%define put_bits r9d
+%define buffer rax
- align 16
- global EXTN(jsimd_huff_encode_one_block_sse2)
+ align 16
+ global EXTN(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [t2]
- collect_args
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [t2]
+ collect_args
%ifdef WIN64
- movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
- movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
- movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
- movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
- sub rsp, 4*SIZEOF_XMMWORD
+ movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
+ movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
+ movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
+ movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
+ sub rsp, 4*SIZEOF_XMMWORD
%endif
- push rbx
-
- mov buffer, r11 ; r11 is now sratch
-
- mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
- mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits;
- push r10 ; r10 is now scratch
-
- ; Encode the DC coefficient difference per section F.1.2.1
- movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
- sub edi, r13d ; r13 is not used anymore
- mov ebx, edi
-
- ; This is a well-known technique for obtaining the absolute value
- ; without a branch. It is derived from an assembly language technique
- ; presented in "How to Optimize for the Pentium Processors",
- ; Copyright (c) 1996, 1997 by Agner Fog.
- mov esi, edi
- sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- xor edi, esi ; temp ^= temp3;
- sub edi, esi ; temp -= temp3;
-
- ; For a negative input, want temp2 = bitwise complement of abs(input)
- ; This code assumes we are on a two's complement machine
- add ebx, esi ; temp2 += temp3;
-
- ; Find the number of bits needed for the magnitude of the coefficient
- lea r11, [rel jpeg_nbits_table]
- movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
- ; Emit the Huffman-coded symbol for the number of bits
- mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
- movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
- EMIT_BITS r11, esi ; EMIT_BITS(code, size)
-
- ; Mask off any extra bits in code
- mov esi, 1
- mov ecx, edi
- shl esi, cl
- dec esi
- and ebx, esi ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-
- ; Emit that number of bits of the value, if positive,
- ; or the complement of its magnitude, if negative.
- EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
-
- ; Prepare data
- xor ebx, ebx
- kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
- 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
- 27, 20, 13, 6, 7, 14, 21, 28, 35, \
- xmm0, xmm1, xmm2, xmm3
- kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
- 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
- 53, 60, 61, 54, 47, 55, 62, 63, 63, \
- xmm4, xmm5, xmm6, xmm7
-
- pxor xmm8, xmm8
- pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
- pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
- pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
- pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
- packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
- pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
- pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
- shl r12, 16
- shl r14, 16
- or r11, r12
- or r13, r14
- shl r13, 32
- or r11, r13
- not r11 ; index = ~index;
-
- ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
- ;jmp .EFN
-
- mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- lea rsi, [t1]
+ push rbx
+
+ mov buffer, r11 ; r11 is now scratch
+
+ mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
+ mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits;
+ push r10 ; r10 is now scratch
+
+ ; Encode the DC coefficient difference per section F.1.2.1
+ movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
+ sub edi, r13d ; r13 is not used anymore
+ mov ebx, edi
+
+ ; This is a well-known technique for obtaining the absolute value
+ ; without a branch. It is derived from an assembly language technique
+ ; presented in "How to Optimize for the Pentium Processors",
+ ; Copyright (c) 1996, 1997 by Agner Fog.
+ mov esi, edi
+ sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+ xor edi, esi ; temp ^= temp3;
+ sub edi, esi ; temp -= temp3;
+
+ ; For a negative input, want temp2 = bitwise complement of abs(input)
+ ; This code assumes we are on a two's complement machine
+ add ebx, esi ; temp2 += temp3;
+
+ ; Find the number of bits needed for the magnitude of the coefficient
+ lea r11, [rel jpeg_nbits_table]
+ movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
+ ; Emit the Huffman-coded symbol for the number of bits
+ mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
+ movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
+ EMIT_BITS r11, esi ; EMIT_BITS(code, size)
+
+ ; Mask off any extra bits in code
+ mov esi, 1
+ mov ecx, edi
+ shl esi, cl
+ dec esi
+ and ebx, esi ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+
+ ; Emit that number of bits of the value, if positive,
+ ; or the complement of its magnitude, if negative.
+ EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
+
+ ; Prepare data
+ xor ebx, ebx
+ kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
+ 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
+ 27, 20, 13, 6, 7, 14, 21, 28, 35, \
+ xmm0, xmm1, xmm2, xmm3
+ kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
+ 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
+ 53, 60, 61, 54, 47, 55, 62, 63, 63, \
+ xmm4, xmm5, xmm6, xmm7
+
+ pxor xmm8, xmm8
+ pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+ pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+ pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+ pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+ pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
+ pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
+ pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
+ pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
+ packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+ packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+ packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
+ packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
+ pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+ pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+ pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
+ pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
+ shl r12, 16
+ shl r14, 16
+ or r11, r12
+ or r13, r14
+ shl r13, 32
+ or r11, r13
+ not r11 ; index = ~index;
+
+ ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
+ ;jmp .EFN
+
+ mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
+ movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
+ lea rsi, [t1]
.BLOOP:
- bsf r12, r11 ; r = __builtin_ctzl(index);
- jz .ELOOP
- mov rcx, r12
- lea rsi, [rsi+r12*2] ; k += r;
- shr r11, cl ; index >>= r;
- movzx rdi, word [rsi] ; temp = t1[k];
- lea rbx, [rel jpeg_nbits_table]
- movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
+ bsf r12, r11 ; r = __builtin_ctzl(index);
+ jz .ELOOP
+ mov rcx, r12
+ lea rsi, [rsi+r12*2] ; k += r;
+ shr r11, cl ; index >>= r;
+ movzx rdi, word [rsi] ; temp = t1[k];
+ lea rbx, [rel jpeg_nbits_table]
+ movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
.BRLOOP:
- cmp r12, 16 ; while (r > 15) {
- jl .ERLOOP
- EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
- sub r12, 16 ; r -= 16;
- jmp .BRLOOP
+ cmp r12, 16 ; while (r > 15) {
+ jl .ERLOOP
+ EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
+ sub r12, 16 ; r -= 16;
+ jmp .BRLOOP
.ERLOOP:
- ; Emit Huffman symbol for run length / number of bits
- CHECKBUF31 ; uses rcx, rdx
-
- shl r12, 4 ; temp3 = (r << 4) + nbits;
- add r12, rdi
- mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
- PUT_BITS rbx
-
- ;EMIT_CODE(code, size)
-
- movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov rcx, rdi
- mov rdx, 1
- shl rdx, cl
- dec rdx
- and rbx, rdx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
- PUT_BITS rbx ; PUT_BITS(temp2, nbits)
-
- shr r11, 1 ; index >>= 1;
- add rsi, 2 ; ++k;
- jmp .BLOOP
+ ; Emit Huffman symbol for run length / number of bits
+ CHECKBUF31 ; uses rcx, rdx
+
+ shl r12, 4 ; temp3 = (r << 4) + nbits;
+ add r12, rdi
+ mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
+ movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
+ PUT_BITS rbx
+
+ ;EMIT_CODE(code, size)
+
+ movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
+ ; Mask off any extra bits in code
+ mov rcx, rdi
+ mov rdx, 1
+ shl rdx, cl
+ dec rdx
+ and rbx, rdx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+ PUT_BITS rbx ; PUT_BITS(temp2, nbits)
+
+ shr r11, 1 ; index >>= 1;
+ add rsi, 2 ; ++k;
+ jmp .BLOOP
.ELOOP:
- ; If the last coef(s) were zero, emit an end-of-block code
- lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
- cmp rdi, rsi ; if (r > 0) {
- je .EFN
- mov ebx, INT [r15] ; code = actbl->ehufco[0];
- movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
- EMIT_BITS rbx, r12d
+ ; If the last coef(s) were zero, emit an end-of-block code
+ lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
+ cmp rdi, rsi ; if (r > 0) {
+ je .EFN
+ mov ebx, INT [r15] ; code = actbl->ehufco[0];
+ movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
+ EMIT_BITS rbx, r12d
.EFN:
- pop r10
- ; Save put_buffer & put_bits
- mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
- mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
+ pop r10
+ ; Save put_buffer & put_bits
+ mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
+ mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
- pop rbx
+ pop rbx
%ifdef WIN64
- movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
- movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
- movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
- movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
- add rsp, 4*SIZEOF_XMMWORD
+ movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+ movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+ movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+ movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+ add rsp, 4*SIZEOF_XMMWORD
%endif
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
%include "jsimdext.inc"
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_huff_encode_one_block)
+ alignz 16
+ global EXTN(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block):
%include "jpeg_nbits_table.inc"
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
; These macros perform the same task as the emit_bits() function in the
; original libjpeg code. In addition to reducing overhead by explicitly
; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
%macro EMIT_BYTE 0
- sub put_bits, 8 ; put_bits -= 8;
- mov edx, put_buffer
- mov ecx, put_bits
- shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
- mov byte [eax], dl ; *buffer++ = c;
- add eax, 1
- cmp dl, 0xFF ; need to stuff a zero byte?
- jne %%.EMIT_BYTE_END
- mov byte [eax], 0 ; *buffer++ = 0;
- add eax, 1
+ sub put_bits, 8 ; put_bits -= 8;
+ mov edx, put_buffer
+ mov ecx, put_bits
+ shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
+ mov byte [eax], dl ; *buffer++ = c;
+ add eax, 1
+ cmp dl, 0xFF ; need to stuff a zero byte?
+ jne %%.EMIT_BYTE_END
+ mov byte [eax], 0 ; *buffer++ = 0;
+ add eax, 1
%%.EMIT_BYTE_END:
%endmacro
%macro PUT_BITS 1
- add put_bits, ecx ; put_bits += size;
- shl put_buffer, cl ; put_buffer = (put_buffer << size);
- or put_buffer, %1
+ add put_bits, ecx ; put_bits += size;
+ shl put_buffer, cl ; put_buffer = (put_buffer << size);
+ or put_buffer, %1
%endmacro
%macro CHECKBUF15 0
- cmp put_bits, 16 ; if (put_bits > 31) {
- jl %%.CHECKBUF15_END
- mov eax, POINTER [esp+buffer]
- EMIT_BYTE
- EMIT_BYTE
- mov POINTER [esp+buffer], eax
+ cmp put_bits, 16 ; if (put_bits > 15) {
+ jl %%.CHECKBUF15_END
+ mov eax, POINTER [esp+buffer]
+ EMIT_BYTE
+ EMIT_BYTE
+ mov POINTER [esp+buffer], eax
%%.CHECKBUF15_END:
%endmacro
%macro EMIT_BITS 1
- PUT_BITS %1
- CHECKBUF15
+ PUT_BITS %1
+ CHECKBUF15
%endmacro
-%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
- pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128();
- pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128();
- pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128();
- pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128();
- pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
- pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
- pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
- pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
- pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
- pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
- pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
- pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
- pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
- pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
- pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
- pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
- pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
- pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
- pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
- pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
- pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
- pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
- pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
- pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
- pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
- pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
- pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
- pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
- pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
- pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
- pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
- pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
- pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
- pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
- pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
+%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
+ pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128();
+ pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128();
+ pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128();
+ pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128();
+ pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
+ pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
+ pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
+ pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
+ pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
+ pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
+ pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
+ pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
+ pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
+ pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
+ pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
+ pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
+ pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
+ pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
+ pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
+ pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
+ pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
+ pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
+ pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
+ pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
+ pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
+ pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
+ pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
+ pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
+ pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
+ pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
+ pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
+ pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
+ pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
+ pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
+ pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
%if %1 != 32
- pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
+ pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
%else
- pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31];
+ pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31];
%endif
- pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
- paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg);
- paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg);
- paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg);
- paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg);
- pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg);
- pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg);
- pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg);
- pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg);
- pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1);
- movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
- movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+ pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
+ pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
+ pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
+ pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
+ paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg);
+ paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg);
+ paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg);
+ paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg);
+ pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg);
+ pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg);
+ pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg);
+ pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg);
+ pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1);
+ pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1);
+ pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1);
+ pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1);
+ movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
+ movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
+ movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
+ movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
+ movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
+ movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
+ movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
+ movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
%endmacro
;
; eax + 24 = c_derived_tbl *dctbl
; eax + 28 = c_derived_tbl *actbl
-%define pad 6*SIZEOF_DWORD ; Align to 16 bytes
-%define t1 pad
-%define t2 t1+(DCTSIZE2*SIZEOF_WORD)
-%define block t2+(DCTSIZE2*SIZEOF_WORD)
-%define actbl block+SIZEOF_DWORD
-%define buffer actbl+SIZEOF_DWORD
-%define temp buffer+SIZEOF_DWORD
-%define temp2 temp+SIZEOF_DWORD
-%define temp3 temp2+SIZEOF_DWORD
-%define temp4 temp3+SIZEOF_DWORD
-%define temp5 temp4+SIZEOF_DWORD
-%define gotptr temp5+SIZEOF_DWORD ; void *gotptr
-%define put_buffer ebx
-%define put_bits edi
-
- align 16
- global EXTN(jsimd_huff_encode_one_block_sse2)
+%define pad 6*SIZEOF_DWORD ; Align to 16 bytes
+%define t1 pad
+%define t2 t1+(DCTSIZE2*SIZEOF_WORD)
+%define block t2+(DCTSIZE2*SIZEOF_WORD)
+%define actbl block+SIZEOF_DWORD
+%define buffer actbl+SIZEOF_DWORD
+%define temp buffer+SIZEOF_DWORD
+%define temp2 temp+SIZEOF_DWORD
+%define temp3 temp2+SIZEOF_DWORD
+%define temp4 temp3+SIZEOF_DWORD
+%define temp5 temp4+SIZEOF_DWORD
+%define gotptr temp5+SIZEOF_DWORD ; void *gotptr
+%define put_buffer ebx
+%define put_bits edi
+
+ align 16
+ global EXTN(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- sub esp, temp5+9*SIZEOF_DWORD-pad
- push ebx
- push ecx
-; push edx ; need not be preserved
- push esi
- push edi
- push ebp
-
- mov esi, POINTER [eax+8] ; (working_state *state)
- mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer;
- mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits;
- push esi ; esi is now scratch
-
- get_GOT edx ; get GOT address
- movpic POINTER [esp+gotptr], edx ; save GOT address
-
- mov ecx, POINTER [eax+28]
- mov edx, POINTER [eax+16]
- mov esi, POINTER [eax+12]
- mov POINTER [esp+actbl], ecx
- mov POINTER [esp+block], edx
- mov POINTER [esp+buffer], esi
-
- ; Encode the DC coefficient difference per section F.1.2.1
- mov esi, POINTER [esp+block] ; block
- movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val;
- sub ecx, DWORD [eax+20]
- mov esi, ecx
-
- ; This is a well-known technique for obtaining the absolute value
- ; without a branch. It is derived from an assembly language technique
- ; presented in "How to Optimize for the Pentium Processors",
- ; Copyright (c) 1996, 1997 by Agner Fog.
- mov edx, ecx
- sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- xor ecx, edx ; temp ^= temp3;
- sub ecx, edx ; temp -= temp3;
-
- ; For a negative input, want temp2 = bitwise complement of abs(input)
- ; This code assumes we are on a two's complement machine
- add esi, edx ; temp2 += temp3;
- mov DWORD [esp+temp], esi ; backup temp2 in temp
-
- ; Find the number of bits needed for the magnitude of the coefficient
- movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp)
- movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp);
- mov DWORD [esp+temp2], edx ; backup nbits in temp2
-
- ; Emit the Huffman-coded symbol for the number of bits
- mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore
- mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits];
- movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits];
- EMIT_BITS eax ; EMIT_BITS(code, size)
-
- mov ecx, DWORD [esp+temp2] ; restore nbits
-
- ; Mask off any extra bits in code
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, DWORD [esp+temp] ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-
- ; Emit that number of bits of the value, if positive,
- ; or the complement of its magnitude, if negative.
- EMIT_BITS eax ; EMIT_BITS(temp2, nbits)
-
- ; Prepare data
- xor ecx, ecx
- mov esi, POINTER [esp+block]
- kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
- 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
- 27, 20, 13, 6, 7, 14, 21, 28, 35, \
- xmm0, xmm1, xmm2, xmm3
- kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
- 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
- 53, 60, 61, 54, 47, 55, 62, 63, 63, \
- xmm0, xmm1, xmm2, xmm3
-
- pxor xmm7, xmm7
- movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
- movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
- movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
- movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
- pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- shl ecx, 16
- or edx, ecx
- not edx ; index = ~index;
-
- lea esi, [esp+t1]
- mov ebp, POINTER [esp+actbl] ; ebp = actbl
+ push ebp
+ mov eax,esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp],eax
+ mov ebp,esp ; ebp = aligned ebp
+ sub esp, temp5+9*SIZEOF_DWORD-pad
+ push ebx
+ push ecx
+; push edx ; need not be preserved
+ push esi
+ push edi
+ push ebp
+
+ mov esi, POINTER [eax+8] ; (working_state *state)
+ mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer;
+ mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits;
+ push esi ; esi is now scratch
+
+ get_GOT edx ; get GOT address
+ movpic POINTER [esp+gotptr], edx ; save GOT address
+
+ mov ecx, POINTER [eax+28]
+ mov edx, POINTER [eax+16]
+ mov esi, POINTER [eax+12]
+ mov POINTER [esp+actbl], ecx
+ mov POINTER [esp+block], edx
+ mov POINTER [esp+buffer], esi
+
+ ; Encode the DC coefficient difference per section F.1.2.1
+ mov esi, POINTER [esp+block] ; block
+ movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val;
+ sub ecx, DWORD [eax+20]
+ mov esi, ecx
+
+ ; This is a well-known technique for obtaining the absolute value
+ ; without a branch. It is derived from an assembly language technique
+ ; presented in "How to Optimize for the Pentium Processors",
+ ; Copyright (c) 1996, 1997 by Agner Fog.
+ mov edx, ecx
+ sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+ xor ecx, edx ; temp ^= temp3;
+ sub ecx, edx ; temp -= temp3;
+
+ ; For a negative input, want temp2 = bitwise complement of abs(input)
+ ; This code assumes we are on a two's complement machine
+ add esi, edx ; temp2 += temp3;
+ mov DWORD [esp+temp], esi ; backup temp2 in temp
+
+ ; Find the number of bits needed for the magnitude of the coefficient
+ movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp)
+ movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp);
+ mov DWORD [esp+temp2], edx ; backup nbits in temp2
+
+ ; Emit the Huffman-coded symbol for the number of bits
+ mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore
+ mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits];
+ movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits];
+ EMIT_BITS eax ; EMIT_BITS(code, size)
+
+ mov ecx, DWORD [esp+temp2] ; restore nbits
+
+ ; Mask off any extra bits in code
+ mov eax, 1
+ shl eax, cl
+ dec eax
+ and eax, DWORD [esp+temp] ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+
+ ; Emit that number of bits of the value, if positive,
+ ; or the complement of its magnitude, if negative.
+ EMIT_BITS eax ; EMIT_BITS(temp2, nbits)
+
+ ; Prepare data
+ xor ecx, ecx
+ mov esi, POINTER [esp+block]
+ kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
+ 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
+ 27, 20, 13, 6, 7, 14, 21, 28, 35, \
+ xmm0, xmm1, xmm2, xmm3
+ kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
+ 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
+ 53, 60, 61, 54, 47, 55, 62, 63, 63, \
+ xmm0, xmm1, xmm2, xmm3
+
+ pxor xmm7, xmm7
+ movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
+ movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
+ movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
+ movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
+ pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+ pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+ pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+ pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+ packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+ packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+ pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+ pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+ shl ecx, 16
+ or edx, ecx
+ not edx ; index = ~index;
+
+ lea esi, [esp+t1]
+ mov ebp, POINTER [esp+actbl] ; ebp = actbl
.BLOOP:
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz .ELOOP
- lea esi, [esi+ecx*2] ; k += r;
- shr edx, cl ; index >>= r;
- mov DWORD [esp+temp3], edx
+ bsf ecx, edx ; r = __builtin_ctzl(index);
+ jz .ELOOP
+ lea esi, [esi+ecx*2] ; k += r;
+ shr edx, cl ; index >>= r;
+ mov DWORD [esp+temp3], edx
.BRLOOP:
- cmp ecx, 16 ; while (r > 15) {
- jl .ERLOOP
- sub ecx, 16 ; r -= 16;
- mov DWORD [esp+temp], ecx
- mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
- mov ecx, DWORD [esp+temp]
- jmp .BRLOOP
+ cmp ecx, 16 ; while (r > 15) {
+ jl .ERLOOP
+ sub ecx, 16 ; r -= 16;
+ mov DWORD [esp+temp], ecx
+ mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
+ movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
+ EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
+ mov ecx, DWORD [esp+temp]
+ jmp .BRLOOP
.ERLOOP:
- movsx eax, word [esi] ; temp = t1[k];
- movpic edx, POINTER [esp+gotptr] ; load GOT address (edx)
- movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp);
- mov DWORD [esp+temp2], eax
- ; Emit Huffman symbol for run length / number of bits
- shl ecx, 4 ; temp3 = (r << 4) + nbits;
- add ecx, eax
- mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
- EMIT_BITS eax
-
- movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov ecx, DWORD [esp+temp2]
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
- EMIT_BITS eax ; PUT_BITS(temp2, nbits)
- mov edx, DWORD [esp+temp3]
- add esi, 2 ; ++k;
- shr edx, 1 ; index >>= 1;
-
- jmp .BLOOP
+ movsx eax, word [esi] ; temp = t1[k];
+ movpic edx, POINTER [esp+gotptr] ; load GOT address (edx)
+ movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp);
+ mov DWORD [esp+temp2], eax
+ ; Emit Huffman symbol for run length / number of bits
+ shl ecx, 4 ; temp3 = (r << 4) + nbits;
+ add ecx, eax
+ mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
+ movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
+ EMIT_BITS eax
+
+ movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
+ ; Mask off any extra bits in code
+ mov ecx, DWORD [esp+temp2]
+ mov eax, 1
+ shl eax, cl
+ dec eax
+ and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+ EMIT_BITS eax ; PUT_BITS(temp2, nbits)
+ mov edx, DWORD [esp+temp3]
+ add esi, 2 ; ++k;
+ shr edx, 1 ; index >>= 1;
+
+ jmp .BLOOP
.ELOOP:
- movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
- movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
- movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
- movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
- pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- shl ecx, 16
- or edx, ecx
- not edx ; index = ~index;
-
- lea eax, [esp + t1 + (DCTSIZE2/2) * 2]
- sub eax, esi
- shr eax, 1
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz .ELOOP2
- shr edx, cl ; index >>= r;
- add ecx, eax
- lea esi, [esi+ecx*2] ; k += r;
- mov DWORD [esp+temp3], edx
- jmp .BRLOOP2
+ movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 32));
+ movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 40));
+ movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 48));
+ movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 56));
+ pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+ pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+ pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+ pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+ packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+ packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+ pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+ pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+ shl ecx, 16
+ or edx, ecx
+ not edx ; index = ~index;
+
+ lea eax, [esp + t1 + (DCTSIZE2/2) * 2]
+ sub eax, esi
+ shr eax, 1
+ bsf ecx, edx ; r = __builtin_ctzl(index);
+ jz .ELOOP2
+ shr edx, cl ; index >>= r;
+ add ecx, eax
+ lea esi, [esi+ecx*2] ; k += r;
+ mov DWORD [esp+temp3], edx
+ jmp .BRLOOP2
.BLOOP2:
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz .ELOOP2
- lea esi, [esi+ecx*2] ; k += r;
- shr edx, cl ; index >>= r;
- mov DWORD [esp+temp3], edx
+ bsf ecx, edx ; r = __builtin_ctzl(index);
+ jz .ELOOP2
+ lea esi, [esi+ecx*2] ; k += r;
+ shr edx, cl ; index >>= r;
+ mov DWORD [esp+temp3], edx
.BRLOOP2:
- cmp ecx, 16 ; while (r > 15) {
- jl .ERLOOP2
- sub ecx, 16 ; r -= 16;
- mov DWORD [esp+temp], ecx
- mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
- mov ecx, DWORD [esp+temp]
- jmp .BRLOOP2
+ cmp ecx, 16 ; while (r > 15) {
+ jl .ERLOOP2
+ sub ecx, 16 ; r -= 16;
+ mov DWORD [esp+temp], ecx
+ mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
+ movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
+ EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
+ mov ecx, DWORD [esp+temp]
+ jmp .BRLOOP2
.ERLOOP2:
- movsx eax, word [esi] ; temp = t1[k];
- bsr eax, eax ; nbits = 32 - __builtin_clz(temp);
- inc eax
- mov DWORD [esp+temp2], eax
- ; Emit Huffman symbol for run length / number of bits
- shl ecx, 4 ; temp3 = (r << 4) + nbits;
- add ecx, eax
- mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
- EMIT_BITS eax
-
- movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov ecx, DWORD [esp+temp2]
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
- EMIT_BITS eax ; PUT_BITS(temp2, nbits)
- mov edx, DWORD [esp+temp3]
- add esi, 2 ; ++k;
- shr edx, 1 ; index >>= 1;
-
- jmp .BLOOP2
+ movsx eax, word [esi] ; temp = t1[k];
+ bsr eax, eax ; nbits = 32 - __builtin_clz(temp);
+ inc eax
+ mov DWORD [esp+temp2], eax
+ ; Emit Huffman symbol for run length / number of bits
+ shl ecx, 4 ; temp3 = (r << 4) + nbits;
+ add ecx, eax
+ mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
+ movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
+ EMIT_BITS eax
+
+ movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
+ ; Mask off any extra bits in code
+ mov ecx, DWORD [esp+temp2]
+ mov eax, 1
+ shl eax, cl
+ dec eax
+ and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
+ EMIT_BITS eax ; PUT_BITS(temp2, nbits)
+ mov edx, DWORD [esp+temp3]
+ add esi, 2 ; ++k;
+ shr edx, 1 ; index >>= 1;
+
+ jmp .BLOOP2
.ELOOP2:
- ; If the last coef(s) were zero, emit an end-of-block code
- lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
- cmp edx, esi ; if (r > 0) {
- je .EFN
- mov eax, INT [ebp] ; code = actbl->ehufco[0];
- movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0];
- EMIT_BITS eax
+ ; If the last coef(s) were zero, emit an end-of-block code
+ lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
+ cmp edx, esi ; if (r > 0) {
+ je .EFN
+ mov eax, INT [ebp] ; code = actbl->ehufco[0];
+ movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0];
+ EMIT_BITS eax
.EFN:
- mov eax, [esp+buffer]
- pop esi
- ; Save put_buffer & put_bits
- mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
- mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits;
-
- pop ebp
- pop edi
- pop esi
-; pop edx ; need not be preserved
- pop ecx
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ mov eax, [esp+buffer]
+ pop esi
+ ; Save put_buffer & put_bits
+ mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
+ mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits;
+
+ pop ebp
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+ pop ecx
+ pop ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
%include "jsimdext.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
- align 16
- global EXTN(jsimd_h2v1_downsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v1_downsample_sse2)
EXTN(jsimd_h2v1_downsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args
- mov ecx, r13d
- shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
- jz near .return
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return
- mov edx, r10d
+ mov edx, r10d
- ; -- expand_right_edge
+ ; -- expand_right_edge
- push rcx
- shl rcx,1 ; output_cols * 2
- sub rcx,rdx
- jle short .expand_end
+ push rcx
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx
+ jle short .expand_end
- mov rax, r11
- test rax,rax
- jle short .expand_end
+ mov rax, r11
+ test rax, rax
+ jle short .expand_end
- cld
- mov rsi, r14 ; input_data
+ cld
+ mov rsi, r14 ; input_data
.expandloop:
- push rax
- push rcx
+ push rax
+ push rcx
- mov rdi, JSAMPROW [rsi]
- add rdi,rdx
- mov al, JSAMPLE [rdi-1]
+ mov rdi, JSAMPROW [rsi]
+ add rdi, rdx
+ mov al, JSAMPLE [rdi-1]
- rep stosb
+ rep stosb
- pop rcx
- pop rax
+ pop rcx
+ pop rax
- add rsi, byte SIZEOF_JSAMPROW
- dec rax
- jg short .expandloop
+ add rsi, byte SIZEOF_JSAMPROW
+ dec rax
+ jg short .expandloop
.expand_end:
- pop rcx ; output_cols
+ pop rcx ; output_cols
- ; -- h2v1_downsample
+ ; -- h2v1_downsample
- mov eax, r12d ; rowctr
- test eax,eax
- jle near .return
+ mov eax, r12d ; rowctr
+ test eax, eax
+ jle near .return
- mov rdx, 0x00010000 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ mov rdx, 0x00010000 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
- mov rsi, r14 ; input_data
- mov rdi, r15 ; output_data
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
.rowloop:
- push rcx
- push rdi
- push rsi
+ push rcx
+ push rdi
+ push rsi
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsi, JSAMPROW [rsi] ; inptr
+ mov rdi, JSAMPROW [rdi] ; outptr
- cmp rcx, byte SIZEOF_XMMWORD
- jae short .columnloop
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
.columnloop_r8:
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- pxor xmm1,xmm1
- mov rcx, SIZEOF_XMMWORD
- jmp short .downsample
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ pxor xmm1, xmm1
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .downsample
.columnloop:
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
.downsample:
- movdqa xmm2,xmm0
- movdqa xmm3,xmm1
-
- pand xmm0,xmm6
- psrlw xmm2,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm3,BYTE_BIT
-
- paddw xmm0,xmm2
- paddw xmm1,xmm3
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- psrlw xmm0,1
- psrlw xmm1,1
-
- packuswb xmm0,xmm1
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
- sub rcx, byte SIZEOF_XMMWORD ; outcol
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr
- add rdi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp rcx, byte SIZEOF_XMMWORD
- jae short .columnloop
- test rcx,rcx
- jnz short .columnloop_r8
-
- pop rsi
- pop rdi
- pop rcx
-
- add rsi, byte SIZEOF_JSAMPROW ; input_data
- add rdi, byte SIZEOF_JSAMPROW ; output_data
- dec rax ; rowctr
- jg near .rowloop
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ pand xmm0, xmm6
+ psrlw xmm2, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm3, BYTE_BIT
+
+ paddw xmm0, xmm2
+ paddw xmm1, xmm3
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ psrlw xmm0, 1
+ psrlw xmm1, 1
+
+ packuswb xmm0, xmm1
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+ sub rcx, byte SIZEOF_XMMWORD ; outcol
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ test rcx, rcx
+ jnz short .columnloop_r8
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
.return:
- uncollect_args
- pop rbp
- ret
+ uncollect_args
+ pop rbp
+ ret
; --------------------------------------------------------------------------
;
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
- align 16
- global EXTN(jsimd_h2v2_downsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v2_downsample_sse2)
EXTN(jsimd_h2v2_downsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args
- mov ecx, r13d
- shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
- jz near .return
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return
- mov edx, r10d
+ mov edx, r10d
- ; -- expand_right_edge
+ ; -- expand_right_edge
- push rcx
- shl rcx,1 ; output_cols * 2
- sub rcx,rdx
- jle short .expand_end
+ push rcx
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx
+ jle short .expand_end
- mov rax, r11
- test rax,rax
- jle short .expand_end
+ mov rax, r11
+ test rax, rax
+ jle short .expand_end
- cld
- mov rsi, r14 ; input_data
+ cld
+ mov rsi, r14 ; input_data
.expandloop:
- push rax
- push rcx
+ push rax
+ push rcx
- mov rdi, JSAMPROW [rsi]
- add rdi,rdx
- mov al, JSAMPLE [rdi-1]
+ mov rdi, JSAMPROW [rsi]
+ add rdi, rdx
+ mov al, JSAMPLE [rdi-1]
- rep stosb
+ rep stosb
- pop rcx
- pop rax
+ pop rcx
+ pop rax
- add rsi, byte SIZEOF_JSAMPROW
- dec rax
- jg short .expandloop
+ add rsi, byte SIZEOF_JSAMPROW
+ dec rax
+ jg short .expandloop
.expand_end:
- pop rcx ; output_cols
+ pop rcx ; output_cols
- ; -- h2v2_downsample
+ ; -- h2v2_downsample
- mov eax, r12d ; rowctr
- test rax,rax
- jle near .return
+ mov eax, r12d ; rowctr
+ test rax, rax
+ jle near .return
- mov rdx, 0x00020001 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ mov rdx, 0x00020001 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
- mov rsi, r14 ; input_data
- mov rdi, r15 ; output_data
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
.rowloop:
- push rcx
- push rdi
- push rsi
+ push rcx
+ push rdi
+ push rsi
- mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
- mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov rdi, JSAMPROW [rdi] ; outptr
- cmp rcx, byte SIZEOF_XMMWORD
- jae short .columnloop
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
.columnloop_r8:
- movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- pxor xmm2,xmm2
- pxor xmm3,xmm3
- mov rcx, SIZEOF_XMMWORD
- jmp short .downsample
+ movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .downsample
.columnloop:
- movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
.downsample:
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- pand xmm0,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm0,xmm4
- paddw xmm1,xmm5
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- pand xmm2,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm3,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm2,xmm4
- paddw xmm3,xmm5
-
- paddw xmm0,xmm1
- paddw xmm2,xmm3
- paddw xmm0,xmm7
- paddw xmm2,xmm7
- psrlw xmm0,2
- psrlw xmm2,2
-
- packuswb xmm0,xmm2
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
- sub rcx, byte SIZEOF_XMMWORD ; outcol
- add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
- add rdi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test rcx,rcx
- jnz near .columnloop_r8
-
- pop rsi
- pop rdi
- pop rcx
-
- add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
- add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
- dec rax ; rowctr
- jg near .rowloop
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ pand xmm0, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ pand xmm2, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm3, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ paddw xmm0, xmm1
+ paddw xmm2, xmm3
+ paddw xmm0, xmm7
+ paddw xmm2, xmm7
+ psrlw xmm0, 2
+ psrlw xmm2, 2
+
+ packuswb xmm0, xmm2
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+ sub rcx, byte SIZEOF_XMMWORD ; outcol
+ add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
+ add rdi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .columnloop_r8
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
.return:
- uncollect_args
- pop rbp
- ret
+ uncollect_args
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
%include "jsimdext.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
%define input_data(b) (b)+24 ; JSAMPARRAY input_data
%define output_data(b) (b)+28 ; JSAMPARRAY output_data
- align 16
- global EXTN(jsimd_h2v1_downsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v1_downsample_sse2)
EXTN(jsimd_h2v1_downsample_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
- jz near .return
-
- mov edx, JDIMENSION [img_width(ebp)]
-
- ; -- expand_right_edge
-
- push ecx
- shl ecx,1 ; output_cols * 2
- sub ecx,edx
- jle short .expand_end
-
- mov eax, INT [max_v_samp(ebp)]
- test eax,eax
- jle short .expand_end
-
- cld
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- alignx 16,7
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
.expandloop:
- push eax
- push ecx
+ push eax
+ push ecx
- mov edi, JSAMPROW [esi]
- add edi,edx
- mov al, JSAMPLE [edi-1]
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
- rep stosb
+ rep stosb
- pop ecx
- pop eax
+ pop ecx
+ pop eax
- add esi, byte SIZEOF_JSAMPROW
- dec eax
- jg short .expandloop
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
.expand_end:
- pop ecx ; output_cols
+ pop ecx ; output_cols
- ; -- h2v1_downsample
+ ; -- h2v1_downsample
- mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
- test eax,eax
- jle near .return
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
- mov edx, 0x00010000 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ mov edx, 0x00010000 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
- alignx 16,7
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
.rowloop:
- push ecx
- push edi
- push esi
+ push ecx
+ push edi
+ push esi
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop
- alignx 16,7
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ alignx 16, 7
.columnloop_r8:
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
- pxor xmm1,xmm1
- mov ecx, SIZEOF_XMMWORD
- jmp short .downsample
- alignx 16,7
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ pxor xmm1, xmm1
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .downsample
+ alignx 16, 7
.columnloop:
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
.downsample:
- movdqa xmm2,xmm0
- movdqa xmm3,xmm1
-
- pand xmm0,xmm6
- psrlw xmm2,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm3,BYTE_BIT
-
- paddw xmm0,xmm2
- paddw xmm1,xmm3
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- psrlw xmm0,1
- psrlw xmm1,1
-
- packuswb xmm0,xmm1
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
- sub ecx, byte SIZEOF_XMMWORD ; outcol
- add esi, byte 2*SIZEOF_XMMWORD ; inptr
- add edi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop
- test ecx,ecx
- jnz short .columnloop_r8
-
- pop esi
- pop edi
- pop ecx
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec eax ; rowctr
- jg near .rowloop
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ pand xmm0, xmm6
+ psrlw xmm2, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm3, BYTE_BIT
+
+ paddw xmm0, xmm2
+ paddw xmm1, xmm3
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ psrlw xmm0, 1
+ psrlw xmm1, 1
+
+ packuswb xmm0, xmm1
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+ sub ecx, byte SIZEOF_XMMWORD ; outcol
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add edi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ test ecx, ecx
+ jnz short .columnloop_r8
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
; --------------------------------------------------------------------------
;
%define input_data(b) (b)+24 ; JSAMPARRAY input_data
%define output_data(b) (b)+28 ; JSAMPARRAY output_data
- align 16
- global EXTN(jsimd_h2v2_downsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v2_downsample_sse2)
EXTN(jsimd_h2v2_downsample_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
- jz near .return
-
- mov edx, JDIMENSION [img_width(ebp)]
-
- ; -- expand_right_edge
-
- push ecx
- shl ecx,1 ; output_cols * 2
- sub ecx,edx
- jle short .expand_end
-
- mov eax, INT [max_v_samp(ebp)]
- test eax,eax
- jle short .expand_end
-
- cld
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- alignx 16,7
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
.expandloop:
- push eax
- push ecx
+ push eax
+ push ecx
- mov edi, JSAMPROW [esi]
- add edi,edx
- mov al, JSAMPLE [edi-1]
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
- rep stosb
+ rep stosb
- pop ecx
- pop eax
+ pop ecx
+ pop eax
- add esi, byte SIZEOF_JSAMPROW
- dec eax
- jg short .expandloop
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
.expand_end:
- pop ecx ; output_cols
+ pop ecx ; output_cols
- ; -- h2v2_downsample
+ ; -- h2v2_downsample
- mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
- test eax,eax
- jle near .return
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
- mov edx, 0x00020001 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ mov edx, 0x00020001 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
- alignx 16,7
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
.rowloop:
- push ecx
- push edi
- push esi
+ push ecx
+ push edi
+ push esi
- mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
- mov edi, JSAMPROW [edi] ; outptr
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop
- alignx 16,7
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ alignx 16, 7
.columnloop_r8:
- movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
- pxor xmm2,xmm2
- pxor xmm3,xmm3
- mov ecx, SIZEOF_XMMWORD
- jmp short .downsample
- alignx 16,7
+ movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .downsample
+ alignx 16, 7
.columnloop:
- movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
.downsample:
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- pand xmm0,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm0,xmm4
- paddw xmm1,xmm5
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- pand xmm2,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm3,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm2,xmm4
- paddw xmm3,xmm5
-
- paddw xmm0,xmm1
- paddw xmm2,xmm3
- paddw xmm0,xmm7
- paddw xmm2,xmm7
- psrlw xmm0,2
- psrlw xmm2,2
-
- packuswb xmm0,xmm2
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
- sub ecx, byte SIZEOF_XMMWORD ; outcol
- add edx, byte 2*SIZEOF_XMMWORD ; inptr0
- add esi, byte 2*SIZEOF_XMMWORD ; inptr1
- add edi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .columnloop_r8
-
- pop esi
- pop edi
- pop ecx
-
- add esi, byte 2*SIZEOF_JSAMPROW ; input_data
- add edi, byte 1*SIZEOF_JSAMPROW ; output_data
- dec eax ; rowctr
- jg near .rowloop
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ pand xmm0, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ pand xmm2, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm3, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ paddw xmm0, xmm1
+ paddw xmm2, xmm3
+ paddw xmm0, xmm7
+ paddw xmm2, xmm7
+ psrlw xmm0, 2
+ psrlw xmm2, 2
+
+ packuswb xmm0, xmm2
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+ sub ecx, byte SIZEOF_XMMWORD ; outcol
+ add edx, byte 2*SIZEOF_XMMWORD ; inptr0
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr1
+ add edi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .columnloop_r8
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; r13 = JSAMPARRAY output_buf
; r14 = int num_rows
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
- align 16
- global EXTN(jsimd_ycc_rgb_convert_sse2)
+ align 16
+ global EXTN(jsimd_ycc_rgb_convert_sse2)
EXTN(jsimd_ycc_rgb_convert_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d ; num_cols
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rdi, r11
- mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
- lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
- lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
- lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
- pop rcx
-
- mov rdi, r13
- mov eax, r14d
- test rax,rax
- jle near .return
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
+ push rbx
+
+ mov ecx, r10d ; num_cols
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rdi, r13
+ mov eax, r14d
+ test rax, rax
+ jle near .return
.rowloop:
- push rax
- push rdi
- push rdx
- push rbx
- push rsi
- push rcx ; col
-
- mov rsi, JSAMPROW [rsi] ; inptr0
- mov rbx, JSAMPROW [rbx] ; inptr1
- mov rdx, JSAMPROW [rdx] ; inptr2
- mov rdi, JSAMPROW [rdi] ; outptr
+ push rax
+ push rdi
+ push rdx
+ push rbx
+ push rsi
+ push rcx ; col
+
+ mov rsi, JSAMPROW [rsi] ; inptr0
+ mov rbx, JSAMPROW [rbx] ; inptr1
+ mov rdx, JSAMPROW [rdx] ; inptr2
+ mov rdi, JSAMPROW [rdi] ; outptr
.columnloop:
- movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
- movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- pcmpeqw xmm7,xmm7
- psrlw xmm4,BYTE_BIT
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
- movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
- pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
- psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
- pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
- psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
-
- paddw xmm4,xmm7
- paddw xmm5,xmm7
- paddw xmm0,xmm7
- paddw xmm1,xmm7
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm2,xmm4 ; xmm2=CbE
- movdqa xmm3,xmm5 ; xmm3=CbO
- paddw xmm4,xmm4 ; xmm4=2*CbE
- paddw xmm5,xmm5 ; xmm5=2*CbO
- movdqa xmm6,xmm0 ; xmm6=CrE
- movdqa xmm7,xmm1 ; xmm7=CrO
- paddw xmm0,xmm0 ; xmm0=2*CrE
- paddw xmm1,xmm1 ; xmm1=2*CrO
-
- pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
- pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
- pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
- pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
-
- paddw xmm4,[rel PW_ONE]
- paddw xmm5,[rel PW_ONE]
- psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
- psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
- paddw xmm0,[rel PW_ONE]
- paddw xmm1,[rel PW_ONE]
- psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
- psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
-
- paddw xmm4,xmm2
- paddw xmm5,xmm3
- paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
- paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
- paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
- paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- punpcklwd xmm2,xmm6
- punpckhwd xmm4,xmm6
- pmaddwd xmm2,[rel PW_MF0344_F0285]
- pmaddwd xmm4,[rel PW_MF0344_F0285]
- punpcklwd xmm3,xmm7
- punpckhwd xmm5,xmm7
- pmaddwd xmm3,[rel PW_MF0344_F0285]
- pmaddwd xmm5,[rel PW_MF0344_F0285]
-
- paddd xmm2,[rel PD_ONEHALF]
- paddd xmm4,[rel PD_ONEHALF]
- psrad xmm2,SCALEBITS
- psrad xmm4,SCALEBITS
- paddd xmm3,[rel PD_ONEHALF]
- paddd xmm5,[rel PD_ONEHALF]
- psrad xmm3,SCALEBITS
- psrad xmm5,SCALEBITS
-
- packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
- packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
- psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
- psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
- movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
- pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
- psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
-
- paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
- paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
- paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
- paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
+ movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm7, xmm7
+ psrlw xmm4, BYTE_BIT
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4, xmm7
+ paddw xmm5, xmm7
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm2, xmm4 ; xmm2=CbE
+ movdqa xmm3, xmm5 ; xmm3=CbO
+ paddw xmm4, xmm4 ; xmm4=2*CbE
+ paddw xmm5, xmm5 ; xmm5=2*CbO
+ movdqa xmm6, xmm0 ; xmm6=CrE
+ movdqa xmm7, xmm1 ; xmm7=CrO
+ paddw xmm0, xmm0 ; xmm0=2*CrE
+ paddw xmm1, xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4, [rel PW_ONE]
+ paddw xmm5, [rel PW_ONE]
+ psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0, [rel PW_ONE]
+ paddw xmm1, [rel PW_ONE]
+ psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm4, xmm6
+ pmaddwd xmm2, [rel PW_MF0344_F0285]
+ pmaddwd xmm4, [rel PW_MF0344_F0285]
+ punpcklwd xmm3, xmm7
+ punpckhwd xmm5, xmm7
+ pmaddwd xmm3, [rel PW_MF0344_F0285]
+ pmaddwd xmm5, [rel PW_MF0344_F0285]
+
+ paddd xmm2, [rel PD_ONEHALF]
+ paddd xmm4, [rel PD_ONEHALF]
+ psrad xmm2, SCALEBITS
+ psrad xmm4, SCALEBITS
+ paddd xmm3, [rel PD_ONEHALF]
+ paddd xmm5, [rel PD_ONEHALF]
+ psrad xmm3, SCALEBITS
+ psrad xmm5, SCALEBITS
+
+ packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .nextrow
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .nextrow
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
.column_st32:
- lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
- cmp rcx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub rcx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
+ lea rcx, [rcx+rcx*2] ; imul rcx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub rcx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD
.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_MMWORD
- sub rcx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [rdi], xmmA
- add rdi, byte SIZEOF_DWORD
- sub rcx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
.column_st3:
- ; Store the lower 2 bytes of rax to the output when it has enough
- ; space.
- movd eax, xmmA
- cmp rcx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [rdi], ax
- add rdi, byte SIZEOF_WORD
- sub rcx, byte SIZEOF_WORD
- shr rax, 16
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
.column_st1:
- ; Store the lower 1 byte of rax to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .nextrow
- mov BYTE [rdi], al
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ mov BYTE [rdi], al
-%else ; RGB_PIXELSIZE == 4 ; -----------
+%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .nextrow
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .nextrow
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
.column_st32:
- cmp rcx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub rcx, byte SIZEOF_XMMWORD/2
+ cmp rcx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD/4
+ cmp rcx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
- ; Store two pixels (8 bytes) of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD/8*4
- sub rcx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD/8*4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
- ; Store one pixel (4 bytes) of xmmA to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .nextrow
- movd XMM_DWORD [rdi], xmmA
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ movd XMM_DWORD [rdi], xmmA
-%endif ; RGB_PIXELSIZE ; ---------------
+%endif ; RGB_PIXELSIZE ; ---------------
.nextrow:
- pop rcx
- pop rsi
- pop rbx
- pop rdx
- pop rdi
- pop rax
-
- add rsi, byte SIZEOF_JSAMPROW
- add rbx, byte SIZEOF_JSAMPROW
- add rdx, byte SIZEOF_JSAMPROW
- add rdi, byte SIZEOF_JSAMPROW ; output_buf
- dec rax ; num_rows
- jg near .rowloop
-
- sfence ; flush the write buffer
+ pop rcx
+ pop rsi
+ pop rbx
+ pop rdx
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ add rdi, byte SIZEOF_JSAMPROW ; output_buf
+ dec rax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ pop rbx
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; JSAMPARRAY output_buf, int num_rows)
;
-%define out_width(b) (b)+8 ; JDIMENSION out_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define input_row(b) (b)+16 ; JDIMENSION input_row
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-%define num_rows(b) (b)+24 ; int num_rows
+%define out_width(b) (b)+8 ; JDIMENSION out_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b)+16 ; JDIMENSION input_row
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b)+24 ; int num_rows
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
- global EXTN(jsimd_ycc_rgb_convert_sse2)
+ align 16
+ global EXTN(jsimd_ycc_rgb_convert_sse2)
EXTN(jsimd_ycc_rgb_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [out_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [input_row(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
.rowloop:
- push eax
- push edi
- push edx
- push ebx
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr0
- mov ebx, JSAMPROW [ebx] ; inptr1
- mov edx, JSAMPROW [edx] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
- alignx 16,7
+ push eax
+ push edi
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16, 7
.columnloop:
- movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
- movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- pcmpeqw xmm7,xmm7
- psrlw xmm4,BYTE_BIT
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
- movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
- pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
- psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
- pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
- psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
-
- paddw xmm4,xmm7
- paddw xmm5,xmm7
- paddw xmm0,xmm7
- paddw xmm1,xmm7
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm2,xmm4 ; xmm2=CbE
- movdqa xmm3,xmm5 ; xmm3=CbO
- paddw xmm4,xmm4 ; xmm4=2*CbE
- paddw xmm5,xmm5 ; xmm5=2*CbO
- movdqa xmm6,xmm0 ; xmm6=CrE
- movdqa xmm7,xmm1 ; xmm7=CrO
- paddw xmm0,xmm0 ; xmm0=2*CrE
- paddw xmm1,xmm1 ; xmm1=2*CrO
-
- pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
- pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
- pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
- pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
-
- paddw xmm4,[GOTOFF(eax,PW_ONE)]
- paddw xmm5,[GOTOFF(eax,PW_ONE)]
- psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
- psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
- paddw xmm0,[GOTOFF(eax,PW_ONE)]
- paddw xmm1,[GOTOFF(eax,PW_ONE)]
- psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
- psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
-
- paddw xmm4,xmm2
- paddw xmm5,xmm3
- paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
- paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
- paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
- paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- punpcklwd xmm2,xmm6
- punpckhwd xmm4,xmm6
- pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd xmm3,xmm7
- punpckhwd xmm5,xmm7
- pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm4,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm2,SCALEBITS
- psrad xmm4,SCALEBITS
- paddd xmm3,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm3,SCALEBITS
- psrad xmm5,SCALEBITS
-
- packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
- packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
- psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
- psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
- movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
- pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
- psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
-
- paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
- paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
- paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
- paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
+ movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm7, xmm7
+ psrlw xmm4, BYTE_BIT
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4, xmm7
+ paddw xmm5, xmm7
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm2, xmm4 ; xmm2=CbE
+ movdqa xmm3, xmm5 ; xmm3=CbO
+ paddw xmm4, xmm4 ; xmm4=2*CbE
+ paddw xmm5, xmm5 ; xmm5=2*CbO
+ movdqa xmm6, xmm0 ; xmm6=CrE
+ movdqa xmm7, xmm1 ; xmm7=CrO
+ paddw xmm0, xmm0 ; xmm0=2*CrE
+ paddw xmm1, xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4, [GOTOFF(eax,PW_ONE)]
+ paddw xmm5, [GOTOFF(eax,PW_ONE)]
+ psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0, [GOTOFF(eax,PW_ONE)]
+ paddw xmm1, [GOTOFF(eax,PW_ONE)]
+ psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm4, xmm6
+ pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm3, xmm7
+ punpckhwd xmm5, xmm7
+ pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2, SCALEBITS
+ psrad xmm4, SCALEBITS
+ paddd xmm3, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm3, SCALEBITS
+ psrad xmm5, SCALEBITS
+
+ packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .nextrow
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .nextrow
- add esi, byte SIZEOF_XMMWORD ; inptr0
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
.column_st32:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub ecx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD
.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_MMWORD
- sub ecx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [edi], xmmA
- add edi, byte SIZEOF_DWORD
- sub ecx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
.column_st3:
- ; Store the lower 2 bytes of eax to the output when it has enough
- ; space.
- movd eax, xmmA
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi], ax
- add edi, byte SIZEOF_WORD
- sub ecx, byte SIZEOF_WORD
- shr eax, 16
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
.column_st1:
- ; Store the lower 1 byte of eax to the output when it has enough
- ; space.
- test ecx, ecx
- jz short .nextrow
- mov BYTE [edi], al
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ mov BYTE [edi], al
-%else ; RGB_PIXELSIZE == 4 ; -----------
+%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .nextrow
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .nextrow
- add esi, byte SIZEOF_XMMWORD ; inptr0
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
.column_st32:
- cmp ecx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub ecx, byte SIZEOF_XMMWORD/2
+ cmp ecx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD/4
+ cmp ecx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
- ; Store two pixels (8 bytes) of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD/8*4
- sub ecx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/8*4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
- ; Store one pixel (4 bytes) of xmmA to the output when it has enough
- ; space.
- test ecx, ecx
- jz short .nextrow
- movd XMM_DWORD [edi], xmmA
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ movd XMM_DWORD [edi], xmmA
-%endif ; RGB_PIXELSIZE ; ---------------
+%endif ; RGB_PIXELSIZE ; ---------------
- alignx 16,7
+ alignx 16, 7
.nextrow:
- pop ecx
- pop esi
- pop ebx
- pop edx
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- add edi, byte SIZEOF_JSAMPROW ; output_buf
- dec eax ; num_rows
- jg near .rowloop
-
- sfence ; flush the write buffer
+ pop ecx
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define SCALEBITS 16
+%define SCALEBITS 16
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_ycc_rgb_convert_sse2)
+ alignz 16
+ global EXTN(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2):
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
%include "jdcolext-sse2-64.asm"
; --------------------------------------------------------------------------
-%define SCALEBITS 16
+%define SCALEBITS 16
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_ycc_rgb_convert_sse2)
+ alignz 16
+ global EXTN(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2):
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
%include "jdcolext-sse2.asm"
;
%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples
-%define ROW(n,b,s) ((b)+(n)*(s))
-%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE)
+%define ROW(n,b,s) ((b)+(n)*(s))
+%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE)
-%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
-%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
-%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
+%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
+%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
+%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
; --------------------------------------------------------------------------
; --------------------------------------------------------------------------
-%define SCALEBITS 16
+%define SCALEBITS 16
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_merged_upsample_sse2)
+ alignz 16
+ global EXTN(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2):
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
%include "jdmrgext-sse2-64.asm"
; --------------------------------------------------------------------------
-%define SCALEBITS 16
+%define SCALEBITS 16
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_merged_upsample_sse2)
+ alignz 16
+ global EXTN(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2):
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
%include "jdmrgext-sse2.asm"
; r12 = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 3
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 3
- align 16
- global EXTN(jsimd_h2v1_merged_upsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v1_merged_upsample_sse2)
EXTN(jsimd_h2v1_merged_upsample_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d ; col
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rdi, r11
- mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
- mov rdi, r13
- mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
- mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
- mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
- mov rdi, JSAMPROW [rdi] ; outptr
-
- pop rcx ; col
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
+ push rbx
+
+ mov ecx, r10d ; col
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+ mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+ mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+ mov rdi, JSAMPROW [rdi] ; outptr
+
+ pop rcx ; col
.columnloop:
- movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
- movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
-
- pxor xmm1,xmm1 ; xmm1=(all 0's)
- pcmpeqw xmm3,xmm3
- psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- movdqa xmm4,xmm6
- punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
- punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
- movdqa xmm0,xmm7
- punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
- punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
-
- paddw xmm6,xmm3
- paddw xmm4,xmm3
- paddw xmm7,xmm3
- paddw xmm0,xmm3
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm5,xmm6 ; xmm5=CbH
- movdqa xmm2,xmm4 ; xmm2=CbL
- paddw xmm6,xmm6 ; xmm6=2*CbH
- paddw xmm4,xmm4 ; xmm4=2*CbL
- movdqa xmm1,xmm7 ; xmm1=CrH
- movdqa xmm3,xmm0 ; xmm3=CrL
- paddw xmm7,xmm7 ; xmm7=2*CrH
- paddw xmm0,xmm0 ; xmm0=2*CrL
-
- pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
- pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
- pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
- pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
-
- paddw xmm6,[rel PW_ONE]
- paddw xmm4,[rel PW_ONE]
- psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
- psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
- paddw xmm7,[rel PW_ONE]
- paddw xmm0,[rel PW_ONE]
- psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
- psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
-
- paddw xmm6,xmm5
- paddw xmm4,xmm2
- paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
- paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
- paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
- paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
-
- movdqa xmm6,xmm5
- movdqa xmm7,xmm2
- punpcklwd xmm5,xmm1
- punpckhwd xmm6,xmm1
- pmaddwd xmm5,[rel PW_MF0344_F0285]
- pmaddwd xmm6,[rel PW_MF0344_F0285]
- punpcklwd xmm2,xmm3
- punpckhwd xmm7,xmm3
- pmaddwd xmm2,[rel PW_MF0344_F0285]
- pmaddwd xmm7,[rel PW_MF0344_F0285]
-
- paddd xmm5,[rel PD_ONEHALF]
- paddd xmm6,[rel PD_ONEHALF]
- psrad xmm5,SCALEBITS
- psrad xmm6,SCALEBITS
- paddd xmm2,[rel PD_ONEHALF]
- paddd xmm7,[rel PD_ONEHALF]
- psrad xmm2,SCALEBITS
- psrad xmm7,SCALEBITS
-
- packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
- packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
- psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
- psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
- movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
-
- mov al,2 ; Yctr
- jmp short .Yloop_1st
+ movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
+ movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
+
+ pxor xmm1, xmm1 ; xmm1=(all 0's)
+ pcmpeqw xmm3, xmm3
+ psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ movdqa xmm4, xmm6
+ punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+ punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
+ movdqa xmm0, xmm7
+ punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+ punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
+
+ paddw xmm6, xmm3
+ paddw xmm4, xmm3
+ paddw xmm7, xmm3
+ paddw xmm0, xmm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm5, xmm6 ; xmm5=CbH
+ movdqa xmm2, xmm4 ; xmm2=CbL
+ paddw xmm6, xmm6 ; xmm6=2*CbH
+ paddw xmm4, xmm4 ; xmm4=2*CbL
+ movdqa xmm1, xmm7 ; xmm1=CrH
+ movdqa xmm3, xmm0 ; xmm3=CrL
+ paddw xmm7, xmm7 ; xmm7=2*CrH
+ paddw xmm0, xmm0 ; xmm0=2*CrL
+
+ pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
+ pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
+ pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
+ pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
+
+ paddw xmm6, [rel PW_ONE]
+ paddw xmm4, [rel PW_ONE]
+ psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
+ psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
+ paddw xmm7, [rel PW_ONE]
+ paddw xmm0, [rel PW_ONE]
+ psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
+ psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
+
+ paddw xmm6, xmm5
+ paddw xmm4, xmm2
+ paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+ movdqa xmm6, xmm5
+ movdqa xmm7, xmm2
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm6, xmm1
+ pmaddwd xmm5, [rel PW_MF0344_F0285]
+ pmaddwd xmm6, [rel PW_MF0344_F0285]
+ punpcklwd xmm2, xmm3
+ punpckhwd xmm7, xmm3
+ pmaddwd xmm2, [rel PW_MF0344_F0285]
+ pmaddwd xmm7, [rel PW_MF0344_F0285]
+
+ paddd xmm5, [rel PD_ONEHALF]
+ paddd xmm6, [rel PD_ONEHALF]
+ psrad xmm5, SCALEBITS
+ psrad xmm6, SCALEBITS
+ paddd xmm2, [rel PD_ONEHALF]
+ paddd xmm7, [rel PD_ONEHALF]
+ psrad xmm2, SCALEBITS
+ psrad xmm7, SCALEBITS
+
+ packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
.Yloop_2nd:
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
.Yloop_1st:
- movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
-
- pcmpeqw xmm6,xmm6
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
- pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
- psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
-
- movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
- movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
- movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
-
- paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
- paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
- paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
- paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
+ movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm6, xmm6
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
+ psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+ movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+ movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+ movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+ paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+ paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .endcolumn
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
.column_st32:
- lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
- cmp rcx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub rcx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
+ lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub rcx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD
.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_MMWORD
- sub rcx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [rdi], xmmA
- add rdi, byte SIZEOF_DWORD
- sub rcx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
.column_st3:
- ; Store the lower 2 bytes of rax to the output when it has enough
- ; space.
- movd eax, xmmA
- cmp rcx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [rdi], ax
- add rdi, byte SIZEOF_WORD
- sub rcx, byte SIZEOF_WORD
- shr rax, 16
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
.column_st1:
- ; Store the lower 1 byte of rax to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .endcolumn
- mov BYTE [rdi], al
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .endcolumn
+ mov BYTE [rdi], al
-%else ; RGB_PIXELSIZE == 4 ; -----------
+%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .endcolumn
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
.column_st32:
- cmp rcx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub rcx, byte SIZEOF_XMMWORD/2
+ cmp rcx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD/4
+ cmp rcx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
- ; Store two pixels (8 bytes) of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq XMM_MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD/8*4
- sub rcx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD/8*4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
- ; Store one pixel (4 bytes) of xmmA to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .endcolumn
- movd XMM_DWORD [rdi], xmmA
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .endcolumn
+ movd XMM_DWORD [rdi], xmmA
-%endif ; RGB_PIXELSIZE ; ---------------
+%endif ; RGB_PIXELSIZE ; ---------------
.endcolumn:
- sfence ; flush the write buffer
+ sfence ; flush the write buffer
.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ pop rbx
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; --------------------------------------------------------------------------
;
; r12 = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
- align 16
- global EXTN(jsimd_h2v2_merged_upsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v2_merged_upsample_sse2)
EXTN(jsimd_h2v2_merged_upsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- mov eax, r10d
-
- mov rdi, r11
- mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
- mov rdi, r13
- lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
-
- push rdx ; inptr2
- push rbx ; inptr1
- push rsi ; inptr00
- mov rbx,rsp
-
- push rdi
- push rcx
- push rax
-
- %ifdef WIN64
- mov r8, rcx
- mov r9, rdi
- mov rcx, rax
- mov rdx, rbx
- %else
- mov rdx, rcx
- mov rcx, rdi
- mov rdi, rax
- mov rsi, rbx
- %endif
-
- call EXTN(jsimd_h2v1_merged_upsample_sse2)
-
- pop rax
- pop rcx
- pop rdi
- pop rsi
- pop rbx
- pop rdx
-
- add rdi, byte SIZEOF_JSAMPROW ; outptr1
- add rsi, byte SIZEOF_JSAMPROW ; inptr01
-
- push rdx ; inptr2
- push rbx ; inptr1
- push rsi ; inptr00
- mov rbx,rsp
-
- push rdi
- push rcx
- push rax
-
- %ifdef WIN64
- mov r8, rcx
- mov r9, rdi
- mov rcx, rax
- mov rdx, rbx
- %else
- mov rdx, rcx
- mov rcx, rdi
- mov rdi, rax
- mov rsi, rbx
- %endif
-
- call EXTN(jsimd_h2v1_merged_upsample_sse2)
-
- pop rax
- pop rcx
- pop rdi
- pop rsi
- pop rbx
- pop rdx
-
- pop rbx
- uncollect_args
- pop rbp
- ret
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args
+ push rbx
+
+ mov eax, r10d
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+ push rdx ; inptr2
+ push rbx ; inptr1
+ push rsi ; inptr00
+ mov rbx, rsp
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx
+ mov r9, rdi
+ mov rcx, rax
+ mov rdx, rbx
+ %else
+ mov rdx, rcx
+ mov rcx, rdi
+ mov rdi, rax
+ mov rsi, rbx
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ pop rsi
+ pop rbx
+ pop rdx
+
+ add rdi, byte SIZEOF_JSAMPROW ; outptr1
+ add rsi, byte SIZEOF_JSAMPROW ; inptr01
+
+ push rdx ; inptr2
+ push rbx ; inptr1
+ push rsi ; inptr00
+ mov rbx, rsp
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx
+ mov r9, rdi
+ mov rcx, rax
+ mov rdx, rbx
+ %else
+ mov rdx, rcx
+ mov rcx, rdi
+ mov rdi, rax
+ mov rsi, rbx
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ pop rsi
+ pop rbx
+ pop rdx
+
+ pop rbx
+ uncollect_args
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; JSAMPARRAY output_buf);
;
-%define output_width(b) (b)+8 ; JDIMENSION output_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
+%define output_width(b) (b)+8 ; JDIMENSION output_width
+%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 3
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 3
+%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
- align 16
- global EXTN(jsimd_h2v1_merged_upsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v1_merged_upsample_sse2)
EXTN(jsimd_h2v1_merged_upsample_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [output_width(eax)] ; col
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [in_row_group_ctr(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
- mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
- mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
-
- pop ecx ; col
-
- alignx 16,7
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16, 7
.columnloop:
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
- movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
-
- pxor xmm1,xmm1 ; xmm1=(all 0's)
- pcmpeqw xmm3,xmm3
- psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- movdqa xmm4,xmm6
- punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
- punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
- movdqa xmm0,xmm7
- punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
- punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
-
- paddw xmm6,xmm3
- paddw xmm4,xmm3
- paddw xmm7,xmm3
- paddw xmm0,xmm3
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm5,xmm6 ; xmm5=CbH
- movdqa xmm2,xmm4 ; xmm2=CbL
- paddw xmm6,xmm6 ; xmm6=2*CbH
- paddw xmm4,xmm4 ; xmm4=2*CbL
- movdqa xmm1,xmm7 ; xmm1=CrH
- movdqa xmm3,xmm0 ; xmm3=CrL
- paddw xmm7,xmm7 ; xmm7=2*CrH
- paddw xmm0,xmm0 ; xmm0=2*CrL
-
- pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
- pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
- pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
- pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
-
- paddw xmm6,[GOTOFF(eax,PW_ONE)]
- paddw xmm4,[GOTOFF(eax,PW_ONE)]
- psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
- psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
- paddw xmm7,[GOTOFF(eax,PW_ONE)]
- paddw xmm0,[GOTOFF(eax,PW_ONE)]
- psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
- psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
-
- paddw xmm6,xmm5
- paddw xmm4,xmm2
- paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
- paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
- paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
- paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
-
- movdqa xmm6,xmm5
- movdqa xmm7,xmm2
- punpcklwd xmm5,xmm1
- punpckhwd xmm6,xmm1
- pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd xmm2,xmm3
- punpckhwd xmm7,xmm3
- pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm6,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm5,SCALEBITS
- psrad xmm6,SCALEBITS
- paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm7,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm2,SCALEBITS
- psrad xmm7,SCALEBITS
-
- packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
- packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
- psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
- psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
- movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
-
- mov al,2 ; Yctr
- jmp short .Yloop_1st
- alignx 16,7
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
+ movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
+
+ pxor xmm1, xmm1 ; xmm1=(all 0's)
+ pcmpeqw xmm3, xmm3
+ psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ movdqa xmm4, xmm6
+ punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+ punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
+ movdqa xmm0, xmm7
+ punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+ punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
+
+ paddw xmm6, xmm3
+ paddw xmm4, xmm3
+ paddw xmm7, xmm3
+ paddw xmm0, xmm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movdqa xmm5, xmm6 ; xmm5=CbH
+ movdqa xmm2, xmm4 ; xmm2=CbL
+ paddw xmm6, xmm6 ; xmm6=2*CbH
+ paddw xmm4, xmm4 ; xmm4=2*CbL
+ movdqa xmm1, xmm7 ; xmm1=CrH
+ movdqa xmm3, xmm0 ; xmm3=CrL
+ paddw xmm7, xmm7 ; xmm7=2*CrH
+ paddw xmm0, xmm0 ; xmm0=2*CrL
+
+ pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
+ pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
+ pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
+ pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
+
+ paddw xmm6, [GOTOFF(eax,PW_ONE)]
+ paddw xmm4, [GOTOFF(eax,PW_ONE)]
+ psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
+ psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
+ paddw xmm7, [GOTOFF(eax,PW_ONE)]
+ paddw xmm0, [GOTOFF(eax,PW_ONE)]
+ psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
+ psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
+
+ paddw xmm6, xmm5
+ paddw xmm4, xmm2
+ paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+ movdqa xmm6, xmm5
+ movdqa xmm7, xmm2
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm6, xmm1
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm2, xmm3
+ punpckhwd xmm7, xmm3
+ pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm6, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm5, SCALEBITS
+ psrad xmm6, SCALEBITS
+ paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm7, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2, SCALEBITS
+ psrad xmm7, SCALEBITS
+
+ packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+ alignx 16, 7
.Yloop_2nd:
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
- alignx 16,7
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+ alignx 16, 7
.Yloop_1st:
- movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
-
- pcmpeqw xmm6,xmm6
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
- pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
- psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
-
- movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
- movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
- movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
-
- paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
- paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
- paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
- paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
+ movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm6, xmm6
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
+ psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+ movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+ movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+ movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+ paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+ paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .endcolumn
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
- add esi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
.column_st32:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub ecx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD
.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_MMWORD
- sub ecx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [edi], xmmA
- add edi, byte SIZEOF_DWORD
- sub ecx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
.column_st3:
- ; Store the lower 2 bytes of eax to the output when it has enough
- ; space.
- movd eax, xmmA
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi], ax
- add edi, byte SIZEOF_WORD
- sub ecx, byte SIZEOF_WORD
- shr eax, 16
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
.column_st1:
- ; Store the lower 1 byte of eax to the output when it has enough
- ; space.
- test ecx, ecx
- jz short .endcolumn
- mov BYTE [edi], al
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ mov BYTE [edi], al
-%else ; RGB_PIXELSIZE == 4 ; -----------
+%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .endcolumn
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
- add esi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
.column_st32:
- cmp ecx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub ecx, byte SIZEOF_XMMWORD/2
+ cmp ecx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD/4
+ cmp ecx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
- ; Store two pixels (8 bytes) of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD/8*4
- sub ecx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/8*4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
- ; Store one pixel (4 bytes) of xmmA to the output when it has enough
- ; space.
- test ecx, ecx
- jz short .endcolumn
- movd XMM_DWORD [edi], xmmA
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ movd XMM_DWORD [edi], xmmA
-%endif ; RGB_PIXELSIZE ; ---------------
+%endif ; RGB_PIXELSIZE ; ---------------
.endcolumn:
- sfence ; flush the write buffer
+ sfence ; flush the write buffer
.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; --------------------------------------------------------------------------
;
; Merged 2:2 upsampling + color conversion, vertical pair driver (i386,
; stack-args ABI).  Builds a temporary 3-entry input_buf on the stack and
; invokes the h2v1 routine once per output row of the row pair.
;
; JSAMPARRAY output_buf);
;

; Stack-argument offsets relative to ebp (return addr at (b)+4).
%define output_width(b)      (b) + 8    ; JDIMENSION output_width
%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf

        align   16
        global  EXTN(jsimd_h2v2_merged_upsample_sse2)

EXTN(jsimd_h2v2_merged_upsample_sse2):
        push    ebp
        mov     ebp, esp
        push    ebx
;       push    ecx                     ; need not be preserved
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        mov     eax, POINTER [output_width(ebp)]

        ; Load the three component row-pointer arrays, indexed by the
        ; current row-group counter (only the Y plane advances per group).
        mov     edi, JSAMPIMAGE [input_buf(ebp)]
        mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
        mov     edi, JSAMPARRAY [output_buf(ebp)]
        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]

        ; Build a temporary input_buf = { inptr00, inptr1, inptr2 } on the
        ; stack; ebx keeps its address so the entries can be patched below.
        push    edx                     ; inptr2
        push    ebx                     ; inptr1
        push    esi                     ; inptr00
        mov     ebx, esp

        ; Arguments for the h2v1 helper (cdecl, pushed right-to-left).
        push    edi                     ; output_buf (outptr0)
        push    ecx                     ; in_row_group_ctr
        push    ebx                     ; input_buf
        push    eax                     ; output_width

        call    near EXTN(jsimd_h2v1_merged_upsample_sse2)

        ; Advance to the second row of the pair: next Y row in, next output
        ; row out.  Patch the stack-resident input_buf[0] and the pushed
        ; output_buf argument in place, then call the helper again.
        add     esi, byte SIZEOF_JSAMPROW       ; inptr01
        add     edi, byte SIZEOF_JSAMPROW       ; outptr1
        mov     POINTER [ebx+0*SIZEOF_POINTER], esi
        mov     POINTER [ebx-1*SIZEOF_POINTER], edi

        call    near EXTN(jsimd_h2v1_merged_upsample_sse2)

        add     esp, byte 7*SIZEOF_DWORD  ; drop 4 args + 3 temp pointers

        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; need not be preserved
        pop     ebx
        pop     ebp
        ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

; Word-broadcast constants used by the fancy (triangle-filter) upsamplers.
PW_ONE   times 8 dw 1
PW_TWO   times 8 dw 2
PW_THREE times 8 dw 3
PW_SEVEN times 8 dw 7
PW_EIGHT times 8 dw 8

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
; Each output pair is a 3:1 / 1:3 weighted blend of neighboring input
; samples ( (3*s[i]+s[i-1]+1)>>2 , (3*s[i]+s[i+1]+2)>>2 ), 16 samples per
; iteration.  x86-64; args arrive via collect_args in r10–r13.
;
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
        push    rbp
        mov     rax, rsp
        mov     rbp, rsp
        collect_args

        mov     eax, r11d               ; colctr
        test    rax, rax
        jz      near .return

        mov     rcx, r10                ; rowctr
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data

.rowloop:
        push    rax                     ; colctr
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        ; If colctr is not a multiple of SIZEOF_XMMWORD, duplicate the last
        ; real sample so the final partial vector reads valid data.
        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
.skip:
        ; xmm7 carries the "left neighbor" byte into the shifted vector;
        ; seeded with sample 0 so the row's left edge replicates itself.
        pxor    xmm0, xmm0              ; xmm0=(all 0's)
        pcmpeqb xmm7, xmm7
        psrldq  xmm7, (SIZEOF_XMMWORD-1)
        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        ; Round colctr up to a whole number of XMM words.
        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; Last block: the "right neighbor" of sample 15 is sample 15 itself.
        pcmpeqb xmm6, xmm6
        pslldq  xmm6, (SIZEOF_XMMWORD-1)
        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        jmp     short .upsample

.columnloop:
        ; Interior block: right neighbor comes from the next XMM word.
        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        pslldq  xmm6, (SIZEOF_XMMWORD-1)

.upsample:
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, xmm1
        movdqa  xmm3, xmm1              ; xmm1=( 0 1 2 ... 13 14 15)
        pslldq  xmm2, 1                 ; xmm2=(-- 0 1 ... 12 13 14)
        psrldq  xmm3, 1                 ; xmm3=( 1 2 3 ... 14 15 --)

        por     xmm2, xmm7              ; xmm2=(-1 0 1 ... 12 13 14)
        por     xmm3, xmm6              ; xmm3=( 1 2 3 ... 14 15 16)

        ; Save sample 15 as next iteration's left-neighbor carry.
        movdqa  xmm7, xmm1
        psrldq  xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)

        ; Widen bytes to words (low/high halves) for the weighted sums.
        movdqa    xmm4, xmm1
        punpcklbw xmm1, xmm0            ; xmm1=( 0 1 2 3 4 5 6 7)
        punpckhbw xmm4, xmm0            ; xmm4=( 8 9 10 11 12 13 14 15)
        movdqa    xmm5, xmm2
        punpcklbw xmm2, xmm0            ; xmm2=(-1 0 1 2 3 4 5 6)
        punpckhbw xmm5, xmm0            ; xmm5=( 7 8 9 10 11 12 13 14)
        movdqa    xmm6, xmm3
        punpcklbw xmm3, xmm0            ; xmm3=( 1 2 3 4 5 6 7 8)
        punpckhbw xmm6, xmm0            ; xmm6=( 9 10 11 12 13 14 15 16)

        ; Even outputs: (3*s + left + 1) >> 2; odd: (3*s + right + 2) >> 2.
        pmullw  xmm1, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]
        paddw   xmm2, [rel PW_ONE]
        paddw   xmm5, [rel PW_ONE]
        paddw   xmm3, [rel PW_TWO]
        paddw   xmm6, [rel PW_TWO]

        paddw   xmm2, xmm1
        paddw   xmm5, xmm4
        psrlw   xmm2, 2                 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
        psrlw   xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
        paddw   xmm3, xmm1
        paddw   xmm6, xmm4
        psrlw   xmm3, 2                 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
        psrlw   xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

        ; Re-interleave even/odd results into bytes.
        psllw   xmm3, BYTE_BIT
        psllw   xmm6, BYTE_BIT
        por     xmm2, xmm3              ; xmm2=OutL=( 0 1 2 ... 13 14 15)
        por     xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

        sub     rax, byte SIZEOF_XMMWORD
        add     rsi, byte 1*SIZEOF_XMMWORD  ; inptr
        add     rdi, byte 2*SIZEOF_XMMWORD  ; outptr
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop
        test    eax, eax
        jnz     near .columnloop_last

        pop     rsi
        pop     rdi
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW  ; input_data
        add     rdi, byte SIZEOF_JSAMPROW  ; output_data
        dec     rcx                     ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
; --------------------------------------------------------------------------
;
; Fancy 2:1 horizontal + 2:1 vertical upsampling.  Produces two output rows
; per input row: each is first blended vertically with the row above/below
; (3:1 weights), then expanded horizontally as in the h2v1 case with
; (3*s + neighbor + 8|7) >> 4 rounding.  x86-64; args via collect_args.
;
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

; Aligned scratch slots below the (re-aligned) frame pointer.
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  4

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
        push    rbp
        mov     rax, rsp                ; rax = original rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
        mov     [rsp], rax
        mov     rbp, rsp                ; rbp = aligned rbp
        lea     rsp, [wk(0)]
        collect_args
        push    rbx

        mov     eax, r11d               ; colctr
        test    rax, rax
        jz      near .return

        mov     rcx, r10                ; rowctr
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data

.rowloop:
        push    rax                     ; colctr
        push    rcx
        push    rdi
        push    rsi

        mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

        ; Pad each of the three input rows with a duplicate of its last
        ; sample when colctr is not XMM-aligned.
        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        push    rdx
        mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
        pop     rdx
.skip:
        ; -- process the first column block

        movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
        movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
        movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

        pxor      xmm3, xmm3            ; xmm3=(all 0's)
        movdqa    xmm4, xmm0
        punpcklbw xmm0, xmm3            ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm4, xmm3            ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
        movdqa    xmm5, xmm1
        punpcklbw xmm1, xmm3            ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm5, xmm3            ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
        movdqa    xmm6, xmm2
        punpcklbw xmm2, xmm3            ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm6, xmm3            ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)

        ; Vertical blend: Int = 3*row[0] + row[-1|+1].
        pmullw  xmm0, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]

        pcmpeqb xmm7, xmm7
        psrldq  xmm7, (SIZEOF_XMMWORD-2)  ; mask for lowest word only

        paddw   xmm1, xmm0              ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
        paddw   xmm5, xmm4              ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
        paddw   xmm2, xmm0              ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
        paddw   xmm6, xmm4              ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)

        ; Intermediate data is parked in the output rows themselves.
        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

        ; Left-edge carry: word 0 replicated as the "-1" neighbor.
        pand    xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
        pand    xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

        movdqa  XMMWORD [wk(0)], xmm1
        movdqa  XMMWORD [wk(1)], xmm2

        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; -- process the last column block

        ; Right-edge carry: word 15 replicated as the "16" neighbor.
        pcmpeqb xmm1, xmm1
        pslldq  xmm1, (SIZEOF_XMMWORD-2)
        movdqa  xmm2, xmm1

        pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

        jmp     near .upsample

.columnloop:
        ; -- process the next column block

        movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
        movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]

        pxor      xmm3, xmm3            ; xmm3=(all 0's)
        movdqa    xmm4, xmm0
        punpcklbw xmm0, xmm3            ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm4, xmm3            ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
        movdqa    xmm5, xmm1
        punpcklbw xmm1, xmm3            ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm5, xmm3            ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
        movdqa    xmm6, xmm2
        punpcklbw xmm2, xmm3            ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm6, xmm3            ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)

        pmullw  xmm0, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]

        paddw   xmm1, xmm0              ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
        paddw   xmm5, xmm4              ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
        paddw   xmm2, xmm0              ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
        paddw   xmm6, xmm4              ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
        movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

        ; Next block's word 0 becomes this block's "16" neighbor.
        pslldq  xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- -- 0)
        pslldq  xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- -- 0)

        movdqa  XMMWORD [wk(2)], xmm1
        movdqa  XMMWORD [wk(3)], xmm2

.upsample:
        ; -- process the upper row

        movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

        movdqa  xmm0, xmm7              ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
        movdqa  xmm4, xmm3              ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
        psrldq  xmm0, 2                 ; xmm0=( 1 2 3 4 5 6 7 --)
        pslldq  xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- -- 8)
        movdqa  xmm5, xmm7
        movdqa  xmm6, xmm3
        psrldq  xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
        pslldq  xmm6, 2                 ; xmm6=(-- 8 9 10 11 12 13 14)

        por     xmm0, xmm4              ; xmm0=( 1 2 3 4 5 6 7 8)
        por     xmm5, xmm6              ; xmm5=( 7 8 9 10 11 12 13 14)

        movdqa  xmm1, xmm7
        movdqa  xmm2, xmm3
        pslldq  xmm1, 2                 ; xmm1=(-- 0 1 2 3 4 5 6)
        psrldq  xmm2, 2                 ; xmm2=( 9 10 11 12 13 14 15 --)
        movdqa  xmm4, xmm3
        psrldq  xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1 0 1 2 3 4 5 6)
        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(0)], xmm4   ; carry word 15 to next block

        ; Horizontal blend: (3*Int + neighbor + 8|7) >> 4.
        pmullw  xmm7, [rel PW_THREE]
        pmullw  xmm3, [rel PW_THREE]
        paddw   xmm1, [rel PW_EIGHT]
        paddw   xmm5, [rel PW_EIGHT]
        paddw   xmm0, [rel PW_SEVEN]
        paddw   xmm2, [rel PW_SEVEN]

        paddw   xmm1, xmm7
        paddw   xmm5, xmm3
        psrlw   xmm1, 4                 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
        psrlw   xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
        paddw   xmm0, xmm7
        paddw   xmm2, xmm3
        psrlw   xmm0, 4                 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
        psrlw   xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

        psllw   xmm0, BYTE_BIT
        psllw   xmm2, BYTE_BIT
        por     xmm1, xmm0              ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
        por     xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

        ; -- process the lower row

        movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
        movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  xmm7, xmm6              ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
        movdqa  xmm3, xmm4              ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
        psrldq  xmm7, 2                 ; xmm7=( 1 2 3 4 5 6 7 --)
        pslldq  xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- -- 8)
        movdqa  xmm0, xmm6
        movdqa  xmm2, xmm4
        psrldq  xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
        pslldq  xmm2, 2                 ; xmm2=(-- 8 9 10 11 12 13 14)

        por     xmm7, xmm3              ; xmm7=( 1 2 3 4 5 6 7 8)
        por     xmm0, xmm2              ; xmm0=( 7 8 9 10 11 12 13 14)

        movdqa  xmm1, xmm6
        movdqa  xmm5, xmm4
        pslldq  xmm1, 2                 ; xmm1=(-- 0 1 2 3 4 5 6)
        psrldq  xmm5, 2                 ; xmm5=( 9 10 11 12 13 14 15 --)
        movdqa  xmm3, xmm4
        psrldq  xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1 0 1 2 3 4 5 6)
        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(1)], xmm3   ; carry word 15 to next block

        pmullw  xmm6, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]
        paddw   xmm1, [rel PW_EIGHT]
        paddw   xmm0, [rel PW_EIGHT]
        paddw   xmm7, [rel PW_SEVEN]
        paddw   xmm5, [rel PW_SEVEN]

        paddw   xmm1, xmm6
        paddw   xmm0, xmm4
        psrlw   xmm1, 4                 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
        psrlw   xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
        paddw   xmm7, xmm6
        paddw   xmm5, xmm4
        psrlw   xmm7, 4                 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
        psrlw   xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

        psllw   xmm7, BYTE_BIT
        psllw   xmm5, BYTE_BIT
        por     xmm1, xmm7              ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
        por     xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

        sub     rax, byte SIZEOF_XMMWORD
        add     rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
        add     rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
        add     rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
        add     rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
        add     rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop
        test    rax, rax
        jnz     near .columnloop_last

        pop     rsi
        pop     rdi
        pop     rcx
        pop     rax

        add     rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
        sub     rcx, byte 2             ; rowctr
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp, rbp                ; rsp <- aligned rbp
        pop     rsp                     ; rsp <- original rbp
        pop     rbp
        ret
; --------------------------------------------------------------------------
;
; Simple (box-filter) 2:1 horizontal upsampling: each input byte is
; duplicated into two adjacent output bytes via punpcklbw/punpckhbw with
; itself.  x86-64; args via collect_args.
;
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
        push    rbp
        mov     rax, rsp
        mov     rbp, rsp
        collect_args

        ; Round output_width up to a multiple of 2 XMM words; zero -> done.
        mov     edx, r11d
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)
        jz      near .return

        mov     rcx, r10                ; rowctr
        test    rcx, rcx
        jz      short .return

        mov     rsi, r12                ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]   ; output_data

.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr
        mov     rax, rdx                ; colctr

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        ; Duplicate each byte: low/high halves interleaved with themselves.
        movdqa    xmm1, xmm0
        punpcklbw xmm0, xmm0
        punpckhbw xmm1, xmm1

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa    xmm3, xmm2
        punpcklbw xmm2, xmm2
        punpckhbw xmm3, xmm3

        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD  ; inptr
        add     rdi, byte 4*SIZEOF_XMMWORD  ; outptr
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        add     rsi, byte SIZEOF_JSAMPROW  ; input_data
        add     rdi, byte SIZEOF_JSAMPROW  ; output_data
        dec     rcx                     ; rowctr
        jg      short .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
; --------------------------------------------------------------------------
;
; It's still a box filter.
;
; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
+; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
; JDIMENSION output_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
- align 16
- global EXTN(jsimd_h2v2_upsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v2_upsample_sse2)
EXTN(jsimd_h2v2_upsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- mov edx, r11d
- add rdx, byte (2*SIZEOF_XMMWORD)-1
- and rdx, byte -(2*SIZEOF_XMMWORD)
- jz near .return
-
- mov rcx, r10 ; rowctr
- test rcx,rcx
- jz near .return
-
- mov rsi, r12 ; input_data
- mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args
+ push rbx
+
+ mov edx, r11d
+ add rdx, byte (2*SIZEOF_XMMWORD)-1
+ and rdx, byte -(2*SIZEOF_XMMWORD)
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdi, JSAMPARRAY [rdi] ; output_data
.rowloop:
- push rdi
- push rsi
+ push rdi
+ push rsi
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
- mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
- mov rax,rdx ; colctr
+ mov rsi, JSAMPROW [rsi] ; inptr
+ mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov rax, rdx ; colctr
.columnloop:
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
- movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
- sub rax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
- movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
- movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
- movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
- sub rax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr
- add rbx, byte 4*SIZEOF_XMMWORD ; outptr0
- add rdi, byte 4*SIZEOF_XMMWORD ; outptr1
- jmp short .columnloop
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rbx, byte 4*SIZEOF_XMMWORD ; outptr0
+ add rdi, byte 4*SIZEOF_XMMWORD ; outptr1
+ jmp short .columnloop
.nextrow:
- pop rsi
- pop rdi
+ pop rsi
+ pop rdi
- add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
- add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub rcx, byte 2 ; rowctr
- jg near .rowloop
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
.return:
- pop rbx
- uncollect_args
- pop rbp
- ret
+ pop rbx
+ uncollect_args
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
%include "jsimdext.inc"
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_fancy_upsample_sse2)
+ alignz 16
+ global EXTN(jconst_fancy_upsample_sse2)
EXTN(jconst_fancy_upsample_sse2):
-PW_ONE times 8 dw 1
-PW_TWO times 8 dw 2
-PW_THREE times 8 dw 3
-PW_SEVEN times 8 dw 7
-PW_EIGHT times 8 dw 8
+PW_ONE times 8 dw 1
+PW_TWO times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; JSAMPARRAY *output_data_ptr);
;
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
- align 16
- global EXTN(jsimd_h2v1_fancy_upsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v1_fancy_upsample_sse2)
EXTN(jsimd_h2v1_fancy_upsample_sse2):
- push ebp
- mov ebp,esp
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
- test eax,eax
- jz near .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
+ push ebp
+ mov ebp, esp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
.rowloop:
- push eax ; colctr
- push edi
- push esi
+ push eax ; colctr
+ push edi
+ push esi
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
- test eax, SIZEOF_XMMWORD-1
- jz short .skip
- mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ test eax, SIZEOF_XMMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
.skip:
- pxor xmm0,xmm0 ; xmm0=(all 0's)
- pcmpeqb xmm7,xmm7
- psrldq xmm7,(SIZEOF_XMMWORD-1)
- pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ pxor xmm0, xmm0 ; xmm0=(all 0's)
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-1)
+ pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
- add eax, byte SIZEOF_XMMWORD-1
- and eax, byte -SIZEOF_XMMWORD
- cmp eax, byte SIZEOF_XMMWORD
- ja short .columnloop
- alignx 16,7
+ add eax, byte SIZEOF_XMMWORD-1
+ and eax, byte -SIZEOF_XMMWORD
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+ alignx 16, 7
.columnloop_last:
- pcmpeqb xmm6,xmm6
- pslldq xmm6,(SIZEOF_XMMWORD-1)
- pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
- jmp short .upsample
- alignx 16,7
+ pcmpeqb xmm6, xmm6
+ pslldq xmm6, (SIZEOF_XMMWORD-1)
+ pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ jmp short .upsample
+ alignx 16, 7
.columnloop:
- movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
- pslldq xmm6,(SIZEOF_XMMWORD-1)
+ movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ pslldq xmm6, (SIZEOF_XMMWORD-1)
.upsample:
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
- pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
- psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
-
- por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
- por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
-
- movdqa xmm7,xmm1
- psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
-
- movdqa xmm4,xmm1
- punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm2
- punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
- punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
- movdqa xmm6,xmm3
- punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
- punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
-
- pmullw xmm1,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
- paddw xmm2,[GOTOFF(ebx,PW_ONE)]
- paddw xmm5,[GOTOFF(ebx,PW_ONE)]
- paddw xmm3,[GOTOFF(ebx,PW_TWO)]
- paddw xmm6,[GOTOFF(ebx,PW_TWO)]
-
- paddw xmm2,xmm1
- paddw xmm5,xmm4
- psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
- psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
- paddw xmm3,xmm1
- paddw xmm6,xmm4
- psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
- psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
- psllw xmm3,BYTE_BIT
- psllw xmm6,BYTE_BIT
- por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
- por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
-
- sub eax, byte SIZEOF_XMMWORD
- add esi, byte 1*SIZEOF_XMMWORD ; inptr
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- cmp eax, byte SIZEOF_XMMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop esi
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec ecx ; rowctr
- jg near .rowloop
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
+ pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
+ psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
+
+ por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
+ por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
+
+ movdqa xmm7, xmm1
+ psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
+
+ movdqa xmm4, xmm1
+ punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm2
+ punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
+ punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
+ movdqa xmm6, xmm3
+ punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
+ punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
+
+ pmullw xmm1, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm2, [GOTOFF(ebx,PW_ONE)]
+ paddw xmm5, [GOTOFF(ebx,PW_ONE)]
+ paddw xmm3, [GOTOFF(ebx,PW_TWO)]
+ paddw xmm6, [GOTOFF(ebx,PW_TWO)]
+
+ paddw xmm2, xmm1
+ paddw xmm5, xmm4
+ psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+ paddw xmm3, xmm1
+ paddw xmm6, xmm4
+ psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm3, BYTE_BIT
+ psllw xmm6, BYTE_BIT
+ por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+ sub eax, byte SIZEOF_XMMWORD
+ add esi, byte 1*SIZEOF_XMMWORD ; inptr
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ cmp eax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ pop ebp
+ ret
; --------------------------------------------------------------------------
;
; JSAMPARRAY *output_data_ptr);
;
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 4
-%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr
- align 16
- global EXTN(jsimd_h2v2_fancy_upsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v2_fancy_upsample_sse2)
EXTN(jsimd_h2v2_fancy_upsample_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov edx,eax ; edx = original ebp
- mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
- test eax,eax
- jz near .return
-
- mov ecx, INT [max_v_samp(edx)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(edx)] ; input_data
- mov edi, POINTER [output_data_ptr(edx)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx, eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
.rowloop:
- push eax ; colctr
- push ecx
- push edi
- push esi
-
- mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
- mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
-
- test eax, SIZEOF_XMMWORD-1
- jz short .skip
- push edx
- mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
- pop edx
+ push eax ; colctr
+ push ecx
+ push edi
+ push esi
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_XMMWORD-1
+ jz short .skip
+ push edx
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
.skip:
- ; -- process the first column block
+ ; -- process the first column block
- movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
- movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
- movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
+ movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
+ movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
+ movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
- pxor xmm3,xmm3 ; xmm3=(all 0's)
- movdqa xmm4,xmm0
- punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm1
- punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
- movdqa xmm6,xmm2
- punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
- pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
+ pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
- pcmpeqb xmm7,xmm7
- psrldq xmm7,(SIZEOF_XMMWORD-2)
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-2)
- paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
- paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
- paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
- paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
- movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
- movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+ movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
- pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
- pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
+ pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
+ pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
- movdqa XMMWORD [wk(0)], xmm1
- movdqa XMMWORD [wk(1)], xmm2
+ movdqa XMMWORD [wk(0)], xmm1
+ movdqa XMMWORD [wk(1)], xmm2
- poppic ebx
+ poppic ebx
- add eax, byte SIZEOF_XMMWORD-1
- and eax, byte -SIZEOF_XMMWORD
- cmp eax, byte SIZEOF_XMMWORD
- ja short .columnloop
- alignx 16,7
+ add eax, byte SIZEOF_XMMWORD-1
+ and eax, byte -SIZEOF_XMMWORD
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+ alignx 16, 7
.columnloop_last:
- ; -- process the last column block
+ ; -- process the last column block
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
- pcmpeqb xmm1,xmm1
- pslldq xmm1,(SIZEOF_XMMWORD-2)
- movdqa xmm2,xmm1
+ pcmpeqb xmm1, xmm1
+ pslldq xmm1, (SIZEOF_XMMWORD-2)
+ movdqa xmm2, xmm1
- pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
- pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+ pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+ pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
- movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
- movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
+ movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
+ movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
- jmp near .upsample
- alignx 16,7
+ jmp near .upsample
+ alignx 16, 7
.columnloop:
- ; -- process the next column block
+ ; -- process the next column block
- movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
- movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
- movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
+ movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
+ movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
- pxor xmm3,xmm3 ; xmm3=(all 0's)
- movdqa xmm4,xmm0
- punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm1
- punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
- movdqa xmm6,xmm2
- punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
- pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
+ pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
- paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
- paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
- paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
- paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
- movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
- movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
- movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+ movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
- pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
- pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
+ pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
+ pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
- movdqa XMMWORD [wk(2)], xmm1
- movdqa XMMWORD [wk(3)], xmm2
+ movdqa XMMWORD [wk(2)], xmm1
+ movdqa XMMWORD [wk(3)], xmm2
.upsample:
- ; -- process the upper row
-
- movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
-
- movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
- movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
- psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
- pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
- movdqa xmm5,xmm7
- movdqa xmm6,xmm3
- psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
- pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
-
- por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
- por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
-
- movdqa xmm1,xmm7
- movdqa xmm2,xmm3
- pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
- psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
- movdqa xmm4,xmm3
- psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
-
- por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
- por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
-
- movdqa XMMWORD [wk(0)], xmm4
-
- pmullw xmm7,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm3,[GOTOFF(ebx,PW_THREE)]
- paddw xmm1,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm5,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm0,[GOTOFF(ebx,PW_SEVEN)]
- paddw xmm2,[GOTOFF(ebx,PW_SEVEN)]
-
- paddw xmm1,xmm7
- paddw xmm5,xmm3
- psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
- psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
- paddw xmm0,xmm7
- paddw xmm2,xmm3
- psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
- psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
- psllw xmm0,BYTE_BIT
- psllw xmm2,BYTE_BIT
- por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
- por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
-
- ; -- process the lower row
-
- movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
- movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
- movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
- movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
- psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
- pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
- movdqa xmm0,xmm6
- movdqa xmm2,xmm4
- psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
- pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
-
- por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
- por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
-
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
- psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
- movdqa xmm3,xmm4
- psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
-
- por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
- por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
-
- movdqa XMMWORD [wk(1)], xmm3
-
- pmullw xmm6,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
- paddw xmm1,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm0,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm7,[GOTOFF(ebx,PW_SEVEN)]
- paddw xmm5,[GOTOFF(ebx,PW_SEVEN)]
-
- paddw xmm1,xmm6
- paddw xmm0,xmm4
- psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
- psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
- paddw xmm7,xmm6
- paddw xmm5,xmm4
- psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
- psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
- psllw xmm7,BYTE_BIT
- psllw xmm5,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
- por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
-
- poppic ebx
-
- sub eax, byte SIZEOF_XMMWORD
- add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
- add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
- add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
- add edx, byte 2*SIZEOF_XMMWORD ; outptr0
- add edi, byte 2*SIZEOF_XMMWORD ; outptr1
- cmp eax, byte SIZEOF_XMMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop esi
- pop edi
- pop ecx
- pop eax
-
- add esi, byte 1*SIZEOF_JSAMPROW ; input_data
- add edi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub ecx, byte 2 ; rowctr
- jg near .rowloop
+ ; -- process the upper row
+
+ movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+ movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
+ pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm3
+ psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+ pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
+
+ por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
+ por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm3
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
+ movdqa xmm4, xmm3
+ psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(0)], xmm4
+
+ pmullw xmm7, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm3, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm5, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm0, [GOTOFF(ebx,PW_SEVEN)]
+ paddw xmm2, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw xmm1, xmm7
+ paddw xmm5, xmm3
+ psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+ paddw xmm0, xmm7
+ paddw xmm2, xmm3
+ psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm0, BYTE_BIT
+ psllw xmm2, BYTE_BIT
+ por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+ ; -- process the lower row
+
+ movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+ movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
+ pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
+ movdqa xmm0, xmm6
+ movdqa xmm2, xmm4
+ psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+ pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
+
+ por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
+ por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
+ movdqa xmm3, xmm4
+ psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(1)], xmm3
+
+ pmullw xmm6, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm0, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm7, [GOTOFF(ebx,PW_SEVEN)]
+ paddw xmm5, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw xmm1, xmm6
+ paddw xmm0, xmm4
+ psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+ paddw xmm7, xmm6
+ paddw xmm5, xmm4
+ psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm7, BYTE_BIT
+ psllw xmm5, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
+ por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+ poppic ebx
+
+ sub eax, byte SIZEOF_XMMWORD
+ add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
+ add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_XMMWORD ; outptr0
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr1
+ cmp eax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop ecx
+ pop eax
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; --------------------------------------------------------------------------
;
; JSAMPARRAY *output_data_ptr);
;
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define output_width(b) (b)+12 ; JDIMENSION output_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define output_width(b) (b)+12 ; JDIMENSION output_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
- align 16
- global EXTN(jsimd_h2v1_upsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v1_upsample_sse2)
EXTN(jsimd_h2v1_upsample_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov edx, JDIMENSION [output_width(ebp)]
- add edx, byte (2*SIZEOF_XMMWORD)-1
- and edx, byte -(2*SIZEOF_XMMWORD)
- jz short .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz short .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_XMMWORD)-1
+ and edx, byte -(2*SIZEOF_XMMWORD)
+ jz short .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
.rowloop:
- push edi
- push esi
+ push edi
+ push esi
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
- mov eax,edx ; colctr
- alignx 16,7
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax, edx ; colctr
+ alignx 16, 7
.columnloop:
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
- movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
- movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
- add esi, byte 2*SIZEOF_XMMWORD ; inptr
- add edi, byte 4*SIZEOF_XMMWORD ; outptr
- jmp short .columnloop
- alignx 16,7
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add edi, byte 4*SIZEOF_XMMWORD ; outptr
+ jmp short .columnloop
+ alignx 16, 7
.nextrow:
- pop esi
- pop edi
+ pop esi
+ pop edi
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec ecx ; rowctr
- jg short .rowloop
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
; --------------------------------------------------------------------------
;
; JSAMPARRAY *output_data_ptr);
;
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define output_width(b) (b)+12 ; JDIMENSION output_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
+%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
+%define output_width(b) (b)+12 ; JDIMENSION output_width
+%define input_data(b) (b)+16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
- align 16
- global EXTN(jsimd_h2v2_upsample_sse2)
+ align 16
+ global EXTN(jsimd_h2v2_upsample_sse2)
EXTN(jsimd_h2v2_upsample_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov edx, JDIMENSION [output_width(ebp)]
- add edx, byte (2*SIZEOF_XMMWORD)-1
- and edx, byte -(2*SIZEOF_XMMWORD)
- jz near .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_XMMWORD)-1
+ and edx, byte -(2*SIZEOF_XMMWORD)
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
.rowloop:
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
- mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
- mov eax,edx ; colctr
- alignx 16,7
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax, edx ; colctr
+ alignx 16, 7
.columnloop:
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
- movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
- movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
- movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
- movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
- add esi, byte 2*SIZEOF_XMMWORD ; inptr
- add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
- add edi, byte 4*SIZEOF_XMMWORD ; outptr1
- jmp short .columnloop
- alignx 16,7
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
+ add edi, byte 4*SIZEOF_XMMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16, 7
.nextrow:
- pop esi
- pop edi
+ pop esi
+ pop edi
- add esi, byte 1*SIZEOF_JSAMPROW ; input_data
- add edi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub ecx, byte 2 ; rowctr
- jg short .rowloop
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg short .rowloop
.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
%endmacro
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
%endmacro
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_fdct_float_sse)
+ alignz 16
+ global EXTN(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse):
-PD_0_382 times 4 dd 0.382683432365089771728460
-PD_0_707 times 4 dd 0.707106781186547524400844
-PD_0_541 times 4 dd 0.541196100146196984399723
-PD_1_306 times 4 dd 1.306562964876376527856643
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
;
; Perform the forward DCT on one block of samples.
;
; r10 = FAST_FLOAT *data
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
- align 16
- global EXTN(jsimd_fdct_float_sse)
+ align 16
+ global EXTN(jsimd_fdct_float_sse)
EXTN(jsimd_fdct_float_sse):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process rows.
-
- mov rdx, r10 ; (FAST_FLOAT *)
- mov rcx, DCTSIZE/4
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (FAST_FLOAT *)
+ mov rcx, DCTSIZE/4
.rowloop:
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
- ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
- unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
- unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
- ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
- unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[rel PD_0_707] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[rel PD_0_707] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[rel PD_0_382] ; xmm2=z5
- mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
- dec rcx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov rdx, r10 ; (FAST_FLOAT *)
- mov rcx, DCTSIZE/4
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
+ unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
+ unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
+ unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [rel PD_0_707] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [rel PD_0_707] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [rel PD_0_382] ; xmm2=z5
+ mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec rcx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov rdx, r10 ; (FAST_FLOAT *)
+ mov rcx, DCTSIZE/4
.columnloop:
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
- ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
- unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
- unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
- ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
- unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
- unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[rel PD_0_707] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[rel PD_0_707] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[rel PD_0_382] ; xmm2=z5
- mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- add rdx, byte 4*SIZEOF_FAST_FLOAT
- dec rcx
- jnz near .columnloop
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
+ unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
+ unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
+ unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
+ unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [rel PD_0_707] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [rel PD_0_707] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [rel PD_0_382] ; xmm2=z5
+ mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add rdx, byte 4*SIZEOF_FAST_FLOAT
+ dec rcx
+ jnz near .columnloop
+
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
%endmacro
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
%endmacro
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_fdct_float_sse)
+ alignz 16
+ global EXTN(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse):
-PD_0_382 times 4 dd 0.382683432365089771728460
-PD_0_707 times 4 dd 0.707106781186547524400844
-PD_0_541 times 4 dd 0.541196100146196984399723
-PD_1_306 times 4 dd 1.306562964876376527856643
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Perform the forward DCT on one block of samples.
;
; jsimd_fdct_float_sse (FAST_FLOAT *data)
;
-%define data(b) (b)+8 ; FAST_FLOAT *data
+%define data(b) (b)+8 ; FAST_FLOAT *data
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
- align 16
- global EXTN(jsimd_fdct_float_sse)
+ align 16
+ global EXTN(jsimd_fdct_float_sse)
EXTN(jsimd_fdct_float_sse):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/4
- alignx 16,7
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
.rowloop:
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
- ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
- unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
- unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
- ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
- unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
- mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/4
- alignx 16,7
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
+ unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
+ unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
+ unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
.columnloop:
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
- ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
- unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
- unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
- ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
- unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
- unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
- mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- add edx, byte 4*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .columnloop
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
+ unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
+ unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
+ unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
+ unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add edx, byte 4*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .columnloop
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define CONST_BITS 8 ; 14 is also OK.
+%define CONST_BITS 8 ; 14 is also OK.
%if CONST_BITS == 8
-F_0_382 equ 98 ; FIX(0.382683433)
-F_0_541 equ 139 ; FIX(0.541196100)
-F_0_707 equ 181 ; FIX(0.707106781)
-F_1_306 equ 334 ; FIX(1.306562965)
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
-F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
+F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965)
%endif
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
- alignz 16
- global EXTN(jconst_fdct_ifast_sse2)
+ alignz 16
+ global EXTN(jconst_fdct_ifast_sse2)
EXTN(jconst_fdct_ifast_sse2):
-PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
-PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
-PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
-PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
;
; Perform the forward DCT on one block of samples.
;
; r10 = DCTELEM *data
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
- align 16
- global EXTN(jsimd_fdct_ifast_sse2)
+ align 16
+ global EXTN(jsimd_fdct_ifast_sse2)
EXTN(jsimd_fdct_ifast_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process rows.
-
- mov rdx, r10 ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
- ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- psubw xmm3,xmm1 ; xmm3=tmp13
- psubw xmm6,xmm7 ; xmm6=tmp12
- paddw xmm4,xmm1 ; xmm4=tmp10
- paddw xmm0,xmm7 ; xmm0=tmp11
-
- paddw xmm6,xmm3
- psllw xmm6,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm6,[rel PW_F0707] ; xmm6=z1
-
- movdqa xmm1,xmm4
- movdqa xmm7,xmm3
- psubw xmm4,xmm0 ; xmm4=data4
- psubw xmm3,xmm6 ; xmm3=data6
- paddw xmm1,xmm0 ; xmm1=data0
- paddw xmm7,xmm6 ; xmm7=data2
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
-
- ; -- Odd part
-
- paddw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm5,xmm0 ; xmm5=tmp11
- paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[rel PW_F0707] ; xmm5=z3
-
- movdqa xmm4,xmm2 ; xmm4=tmp10
- psubw xmm2,xmm0
- pmulhw xmm2,[rel PW_F0382] ; xmm2=z5
- pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm2 ; xmm4=z2
- paddw xmm0,xmm2 ; xmm0=z4
-
- movdqa xmm3,xmm6
- psubw xmm6,xmm5 ; xmm6=z13
- paddw xmm3,xmm5 ; xmm3=z11
-
- movdqa xmm2,xmm6
- movdqa xmm5,xmm3
- psubw xmm6,xmm4 ; xmm6=data3
- psubw xmm3,xmm0 ; xmm3=data7
- paddw xmm2,xmm4 ; xmm2=data5
- paddw xmm5,xmm0 ; xmm5=data1
-
- ; ---- Pass 2: process columns.
-
- ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
- ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
- punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
-
- ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
- ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
- punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
- movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
- punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
-
- movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
- punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
- punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
- movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
- punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
-
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
- punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
- punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm5,xmm6
- movdqa xmm3,xmm1
- psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
- psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
- paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
- paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
-
- movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm7,xmm6
- movdqa xmm0,xmm2
- paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
- paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
- psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
- psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm1,xmm5
- psubw xmm3,xmm6 ; xmm3=tmp13
- psubw xmm5,xmm2 ; xmm5=tmp12
- paddw xmm4,xmm6 ; xmm4=tmp10
- paddw xmm1,xmm2 ; xmm1=tmp11
-
- paddw xmm5,xmm3
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[rel PW_F0707] ; xmm5=z1
-
- movdqa xmm6,xmm4
- movdqa xmm2,xmm3
- psubw xmm4,xmm1 ; xmm4=data4
- psubw xmm3,xmm5 ; xmm3=data6
- paddw xmm6,xmm1 ; xmm6=data0
- paddw xmm2,xmm5 ; xmm2=data2
-
- movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
-
- ; -- Odd part
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- paddw xmm7,xmm0 ; xmm7=tmp10
- paddw xmm0,xmm1 ; xmm0=tmp11
- paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
-
- psllw xmm7,PRE_MULTIPLY_SCALE_BITS
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm0,[rel PW_F0707] ; xmm0=z3
-
- movdqa xmm4,xmm7 ; xmm4=tmp10
- psubw xmm7,xmm1
- pmulhw xmm7,[rel PW_F0382] ; xmm7=z5
- pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm7 ; xmm4=z2
- paddw xmm1,xmm7 ; xmm1=z4
-
- movdqa xmm3,xmm5
- psubw xmm5,xmm0 ; xmm5=z13
- paddw xmm3,xmm0 ; xmm3=z11
-
- movdqa xmm6,xmm5
- movdqa xmm2,xmm3
- psubw xmm5,xmm4 ; xmm5=data3
- psubw xmm3,xmm1 ; xmm3=data7
- paddw xmm6,xmm4 ; xmm6=data5
- paddw xmm2,xmm1 ; xmm2=data1
-
- movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
- movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+ ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ psubw xmm3, xmm1 ; xmm3=tmp13
+ psubw xmm6, xmm7 ; xmm6=tmp12
+ paddw xmm4, xmm1 ; xmm4=tmp10
+ paddw xmm0, xmm7 ; xmm0=tmp11
+
+ paddw xmm6, xmm3
+ psllw xmm6, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm6, [rel PW_F0707] ; xmm6=z1
+
+ movdqa xmm1, xmm4
+ movdqa xmm7, xmm3
+ psubw xmm4, xmm0 ; xmm4=data4
+ psubw xmm3, xmm6 ; xmm3=data6
+ paddw xmm1, xmm0 ; xmm1=data0
+ paddw xmm7, xmm6 ; xmm7=data2
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
+
+ ; -- Odd part
+
+ paddw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm5, xmm0 ; xmm5=tmp11
+ paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F0707] ; xmm5=z3
+
+ movdqa xmm4, xmm2 ; xmm4=tmp10
+ psubw xmm2, xmm0
+ pmulhw xmm2, [rel PW_F0382] ; xmm2=z5
+ pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm2 ; xmm4=z2
+ paddw xmm0, xmm2 ; xmm0=z4
+
+ movdqa xmm3, xmm6
+ psubw xmm6, xmm5 ; xmm6=z13
+ paddw xmm3, xmm5 ; xmm3=z11
+
+ movdqa xmm2, xmm6
+ movdqa xmm5, xmm3
+ psubw xmm6, xmm4 ; xmm6=data3
+ psubw xmm3, xmm0 ; xmm3=data7
+ paddw xmm2, xmm4 ; xmm2=data5
+ paddw xmm5, xmm0 ; xmm5=data1
+
+ ; ---- Pass 2: process columns.
+
+ ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+ ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
+
+ ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+ ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
+ movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
+ punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm5, xmm6
+ movdqa xmm3, xmm1
+ psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
+ psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
+ paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
+ paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
+
+ movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm7, xmm6
+ movdqa xmm0, xmm2
+ paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
+ paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
+ psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
+ psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm1, xmm5
+ psubw xmm3, xmm6 ; xmm3=tmp13
+ psubw xmm5, xmm2 ; xmm5=tmp12
+ paddw xmm4, xmm6 ; xmm4=tmp10
+ paddw xmm1, xmm2 ; xmm1=tmp11
+
+ paddw xmm5, xmm3
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F0707] ; xmm5=z1
+
+ movdqa xmm6, xmm4
+ movdqa xmm2, xmm3
+ psubw xmm4, xmm1 ; xmm4=data4
+ psubw xmm3, xmm5 ; xmm3=data6
+ paddw xmm6, xmm1 ; xmm6=data0
+ paddw xmm2, xmm5 ; xmm2=data2
+
+ movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+ ; -- Odd part
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ paddw xmm7, xmm0 ; xmm7=tmp10
+ paddw xmm0, xmm1 ; xmm0=tmp11
+ paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
+
+ psllw xmm7, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm0, [rel PW_F0707] ; xmm0=z3
+
+ movdqa xmm4, xmm7 ; xmm4=tmp10
+ psubw xmm7, xmm1
+ pmulhw xmm7, [rel PW_F0382] ; xmm7=z5
+ pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm7 ; xmm4=z2
+ paddw xmm1, xmm7 ; xmm1=z4
+
+ movdqa xmm3, xmm5
+ psubw xmm5, xmm0 ; xmm5=z13
+ paddw xmm3, xmm0 ; xmm3=z11
+
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm3
+ psubw xmm5, xmm4 ; xmm5=data3
+ psubw xmm3, xmm1 ; xmm3=data7
+ paddw xmm6, xmm4 ; xmm6=data5
+ paddw xmm2, xmm1 ; xmm2=data1
+
+ movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+ movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define CONST_BITS 8 ; 14 is also OK.
+%define CONST_BITS 8 ; 14 is also OK.
%if CONST_BITS == 8
-F_0_382 equ 98 ; FIX(0.382683433)
-F_0_541 equ 139 ; FIX(0.541196100)
-F_0_707 equ 181 ; FIX(0.707106781)
-F_1_306 equ 334 ; FIX(1.306562965)
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
-F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
+F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965)
%endif
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
- alignz 16
- global EXTN(jconst_fdct_ifast_sse2)
+ alignz 16
+ global EXTN(jconst_fdct_ifast_sse2)
EXTN(jconst_fdct_ifast_sse2):
-PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
-PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
-PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
-PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Perform the forward DCT on one block of samples.
;
; jsimd_fdct_ifast_sse2 (DCTELEM *data)
;
-%define data(b) (b)+8 ; DCTELEM *data
+%define data(b) (b)+8 ; DCTELEM *data
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
- align 16
- global EXTN(jsimd_fdct_ifast_sse2)
+ align 16
+ global EXTN(jsimd_fdct_ifast_sse2)
EXTN(jsimd_fdct_ifast_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
- ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- psubw xmm3,xmm1 ; xmm3=tmp13
- psubw xmm6,xmm7 ; xmm6=tmp12
- paddw xmm4,xmm1 ; xmm4=tmp10
- paddw xmm0,xmm7 ; xmm0=tmp11
-
- paddw xmm6,xmm3
- psllw xmm6,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
-
- movdqa xmm1,xmm4
- movdqa xmm7,xmm3
- psubw xmm4,xmm0 ; xmm4=data4
- psubw xmm3,xmm6 ; xmm3=data6
- paddw xmm1,xmm0 ; xmm1=data0
- paddw xmm7,xmm6 ; xmm7=data2
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
-
- ; -- Odd part
-
- paddw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm5,xmm0 ; xmm5=tmp11
- paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
-
- movdqa xmm4,xmm2 ; xmm4=tmp10
- psubw xmm2,xmm0
- pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
- pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm2 ; xmm4=z2
- paddw xmm0,xmm2 ; xmm0=z4
-
- movdqa xmm3,xmm6
- psubw xmm6,xmm5 ; xmm6=z13
- paddw xmm3,xmm5 ; xmm3=z11
-
- movdqa xmm2,xmm6
- movdqa xmm5,xmm3
- psubw xmm6,xmm4 ; xmm6=data3
- psubw xmm3,xmm0 ; xmm3=data7
- paddw xmm2,xmm4 ; xmm2=data5
- paddw xmm5,xmm0 ; xmm5=data1
-
- ; ---- Pass 2: process columns.
-
-; mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
- ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
- punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
-
- ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
- ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
- punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
- movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
- punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
-
- movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
- punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
- punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
- movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
- punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
-
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
- punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
- punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm5,xmm6
- movdqa xmm3,xmm1
- psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
- psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
- paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
- paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
-
- movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm7,xmm6
- movdqa xmm0,xmm2
- paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
- paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
- psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
- psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm1,xmm5
- psubw xmm3,xmm6 ; xmm3=tmp13
- psubw xmm5,xmm2 ; xmm5=tmp12
- paddw xmm4,xmm6 ; xmm4=tmp10
- paddw xmm1,xmm2 ; xmm1=tmp11
-
- paddw xmm5,xmm3
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
-
- movdqa xmm6,xmm4
- movdqa xmm2,xmm3
- psubw xmm4,xmm1 ; xmm4=data4
- psubw xmm3,xmm5 ; xmm3=data6
- paddw xmm6,xmm1 ; xmm6=data0
- paddw xmm2,xmm5 ; xmm2=data2
-
- movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
-
- ; -- Odd part
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- paddw xmm7,xmm0 ; xmm7=tmp10
- paddw xmm0,xmm1 ; xmm0=tmp11
- paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
-
- psllw xmm7,PRE_MULTIPLY_SCALE_BITS
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
-
- movdqa xmm4,xmm7 ; xmm4=tmp10
- psubw xmm7,xmm1
- pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
- pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm7 ; xmm4=z2
- paddw xmm1,xmm7 ; xmm1=z4
-
- movdqa xmm3,xmm5
- psubw xmm5,xmm0 ; xmm5=z13
- paddw xmm3,xmm0 ; xmm3=z11
-
- movdqa xmm6,xmm5
- movdqa xmm2,xmm3
- psubw xmm5,xmm4 ; xmm5=data3
- psubw xmm3,xmm1 ; xmm3=data7
- paddw xmm6,xmm4 ; xmm6=data5
- paddw xmm2,xmm1 ; xmm2=data1
-
- movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
- movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+ ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ psubw xmm3, xmm1 ; xmm3=tmp13
+ psubw xmm6, xmm7 ; xmm6=tmp12
+ paddw xmm4, xmm1 ; xmm4=tmp10
+ paddw xmm0, xmm7 ; xmm0=tmp11
+
+ paddw xmm6, xmm3
+ psllw xmm6, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1
+
+ movdqa xmm1, xmm4
+ movdqa xmm7, xmm3
+ psubw xmm4, xmm0 ; xmm4=data4
+ psubw xmm3, xmm6 ; xmm3=data6
+ paddw xmm1, xmm0 ; xmm1=data0
+ paddw xmm7, xmm6 ; xmm7=data2
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
+
+ ; -- Odd part
+
+ paddw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm5, xmm0 ; xmm5=tmp11
+ paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3
+
+ movdqa xmm4, xmm2 ; xmm4=tmp10
+ psubw xmm2, xmm0
+ pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5
+ pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm2 ; xmm4=z2
+ paddw xmm0, xmm2 ; xmm0=z4
+
+ movdqa xmm3, xmm6
+ psubw xmm6, xmm5 ; xmm6=z13
+ paddw xmm3, xmm5 ; xmm3=z11
+
+ movdqa xmm2, xmm6
+ movdqa xmm5, xmm3
+ psubw xmm6, xmm4 ; xmm6=data3
+ psubw xmm3, xmm0 ; xmm3=data7
+ paddw xmm2, xmm4 ; xmm2=data5
+ paddw xmm5, xmm0 ; xmm5=data1
+
+ ; ---- Pass 2: process columns.
+
+; mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+ ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
+
+ ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+ ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
+ movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
+ punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm5, xmm6
+ movdqa xmm3, xmm1
+ psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
+ psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
+ paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
+ paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
+
+ movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm7, xmm6
+ movdqa xmm0, xmm2
+ paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
+ paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
+ psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
+ psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm1, xmm5
+ psubw xmm3, xmm6 ; xmm3=tmp13
+ psubw xmm5, xmm2 ; xmm5=tmp12
+ paddw xmm4, xmm6 ; xmm4=tmp10
+ paddw xmm1, xmm2 ; xmm1=tmp11
+
+ paddw xmm5, xmm3
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1
+
+ movdqa xmm6, xmm4
+ movdqa xmm2, xmm3
+ psubw xmm4, xmm1 ; xmm4=data4
+ psubw xmm3, xmm5 ; xmm3=data6
+ paddw xmm6, xmm1 ; xmm6=data0
+ paddw xmm2, xmm5 ; xmm2=data2
+
+ movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+ ; -- Odd part
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ paddw xmm7, xmm0 ; xmm7=tmp10
+ paddw xmm0, xmm1 ; xmm0=tmp11
+ paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
+
+ psllw xmm7, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3
+
+ movdqa xmm4, xmm7 ; xmm4=tmp10
+ psubw xmm7, xmm1
+ pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5
+ pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm7 ; xmm4=z2
+ paddw xmm1, xmm7 ; xmm1=z4
+
+ movdqa xmm3, xmm5
+ psubw xmm5, xmm0 ; xmm5=z13
+ paddw xmm3, xmm0 ; xmm3=z11
+
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm3
+ psubw xmm5, xmm4 ; xmm5=data3
+ psubw xmm3, xmm1 ; xmm3=data7
+ paddw xmm6, xmm4 ; xmm6=data5
+ paddw xmm2, xmm1 ; xmm2=data1
+
+ movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+ movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp,ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define CONST_BITS 13
-%define PASS1_BITS 2
+%define CONST_BITS 13
+%define PASS1_BITS 2
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
+%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
+F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
%endif
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_fdct_islow_sse2)
+ alignz 16
+ global EXTN(jconst_fdct_islow_sse2)
EXTN(jconst_fdct_islow_sse2):
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
+PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 ; pmaddwd pair -> data2 (even part)
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) ; pmaddwd pair -> data6 (even part)
+PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 ; pmaddwd pair -> z3 (odd part)
+PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) ; pmaddwd pair -> z4 (odd part)
+PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 ; pmaddwd pair -> tmp4 (odd part)
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) ; pmaddwd pair -> tmp7 (odd part)
+PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 ; pmaddwd pair -> tmp5 (odd part)
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) ; pmaddwd pair -> tmp6 (odd part)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) ; pass-1 rounding bias, added before psrad DESCALE_P1
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) ; pass-2 rounding bias, added before psrad DESCALE_P2
+PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) ; pass-2 rounding bias for the psraw PASS1_BITS path
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
;
; Perform the forward DCT on one block of samples.
;
; r10 = DCTELEM *data
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 6
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 6
- align 16
- global EXTN(jsimd_fdct_islow_sse2)
+ align 16
+ global EXTN(jsimd_fdct_islow_sse2)
EXTN(jsimd_fdct_islow_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process rows.
-
- mov rdx, r10 ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
- ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- paddw xmm3,xmm1 ; xmm3=tmp10
- paddw xmm6,xmm7 ; xmm6=tmp11
- psubw xmm4,xmm1 ; xmm4=tmp13
- psubw xmm0,xmm7 ; xmm0=tmp12
-
- movdqa xmm1,xmm3
- paddw xmm3,xmm6 ; xmm3=tmp10+tmp11
- psubw xmm1,xmm6 ; xmm1=tmp10-tmp11
-
- psllw xmm3,PASS1_BITS ; xmm3=data0
- psllw xmm1,PASS1_BITS ; xmm1=data4
-
- movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
- movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm7,xmm4 ; xmm4=tmp13
- movdqa xmm6,xmm4
- punpcklwd xmm7,xmm0 ; xmm0=tmp12
- punpckhwd xmm6,xmm0
- movdqa xmm4,xmm7
- movdqa xmm0,xmm6
- pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L
- pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H
- pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L
- pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H
-
- paddd xmm7,[rel PD_DESCALE_P1]
- paddd xmm6,[rel PD_DESCALE_P1]
- psrad xmm7,DESCALE_P1
- psrad xmm6,DESCALE_P1
- paddd xmm4,[rel PD_DESCALE_P1]
- paddd xmm0,[rel PD_DESCALE_P1]
- psrad xmm4,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm7,xmm6 ; xmm7=data2
- packssdw xmm4,xmm0 ; xmm4=data6
-
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
-
- ; -- Odd part
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
-
- movdqa xmm6,xmm2 ; xmm2=tmp4
- movdqa xmm0,xmm5 ; xmm5=tmp5
- paddw xmm6,xmm3 ; xmm6=z3
- paddw xmm0,xmm1 ; xmm0=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm7,xmm6
- movdqa xmm4,xmm6
- punpcklwd xmm7,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm6,xmm7
- movdqa xmm0,xmm4
- pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L
- pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3H
- pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L
- pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm7,xmm2
- movdqa xmm4,xmm2
- punpcklwd xmm7,xmm1
- punpckhwd xmm4,xmm1
- movdqa xmm2,xmm7
- movdqa xmm1,xmm4
- pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L
- pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H
- pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L
- pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H
-
- paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
- paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
- paddd xmm2,xmm6 ; xmm2=data1L
- paddd xmm1,xmm0 ; xmm1=data1H
-
- paddd xmm7,[rel PD_DESCALE_P1]
- paddd xmm4,[rel PD_DESCALE_P1]
- psrad xmm7,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm2,[rel PD_DESCALE_P1]
- paddd xmm1,[rel PD_DESCALE_P1]
- psrad xmm2,DESCALE_P1
- psrad xmm1,DESCALE_P1
-
- packssdw xmm7,xmm4 ; xmm7=data7
- packssdw xmm2,xmm1 ; xmm2=data1
-
- movdqa xmm4,xmm5
- movdqa xmm1,xmm5
- punpcklwd xmm4,xmm3
- punpckhwd xmm1,xmm3
- movdqa xmm5,xmm4
- movdqa xmm3,xmm1
- pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L
- pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H
- pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L
- pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6H
-
- paddd xmm4,xmm6 ; xmm4=data5L
- paddd xmm1,xmm0 ; xmm1=data5H
- paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
- paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
-
- paddd xmm4,[rel PD_DESCALE_P1]
- paddd xmm1,[rel PD_DESCALE_P1]
- psrad xmm4,DESCALE_P1
- psrad xmm1,DESCALE_P1
- paddd xmm5,[rel PD_DESCALE_P1]
- paddd xmm3,[rel PD_DESCALE_P1]
- psrad xmm5,DESCALE_P1
- psrad xmm3,DESCALE_P1
-
- packssdw xmm4,xmm1 ; xmm4=data5
- packssdw xmm5,xmm3 ; xmm5=data3
-
- ; ---- Pass 2: process columns.
-
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
- movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
-
- ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
- ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
- punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
- movdqa xmm3,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
- punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
-
- movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
- movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
-
- ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
- ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm0,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
- punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
- movdqa xmm3,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
- punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
-
- movdqa xmm4,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
- punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
- punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
- punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
- punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
-
- movdqa xmm5,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm2,xmm5
- movdqa xmm7,xmm6
- psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6
- psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7
- paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1
- paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0
-
- movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
- movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movdqa xmm5,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm0,xmm5
- movdqa xmm3,xmm4
- paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3
- paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2
- psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4
- psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm1,xmm7
- movdqa xmm6,xmm2
- paddw xmm7,xmm5 ; xmm7=tmp10
- paddw xmm2,xmm4 ; xmm2=tmp11
- psubw xmm1,xmm5 ; xmm1=tmp13
- psubw xmm6,xmm4 ; xmm6=tmp12
-
- movdqa xmm5,xmm7
- paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
- psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
-
- paddw xmm7,[rel PW_DESCALE_P2X]
- paddw xmm5,[rel PW_DESCALE_P2X]
- psraw xmm7,PASS1_BITS ; xmm7=data0
- psraw xmm5,PASS1_BITS ; xmm5=data4
-
- movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
- movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm4,xmm1 ; xmm1=tmp13
- movdqa xmm2,xmm1
- punpcklwd xmm4,xmm6 ; xmm6=tmp12
- punpckhwd xmm2,xmm6
- movdqa xmm1,xmm4
- movdqa xmm6,xmm2
- pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L
- pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H
- pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L
- pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H
-
- paddd xmm4,[rel PD_DESCALE_P2]
- paddd xmm2,[rel PD_DESCALE_P2]
- psrad xmm4,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm1,[rel PD_DESCALE_P2]
- paddd xmm6,[rel PD_DESCALE_P2]
- psrad xmm1,DESCALE_P2
- psrad xmm6,DESCALE_P2
-
- packssdw xmm4,xmm2 ; xmm4=data2
- packssdw xmm1,xmm6 ; xmm1=data6
-
- movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
-
- ; -- Odd part
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- movdqa xmm2,xmm0 ; xmm0=tmp4
- movdqa xmm6,xmm3 ; xmm3=tmp5
- paddw xmm2,xmm7 ; xmm2=z3
- paddw xmm6,xmm5 ; xmm6=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm4,xmm2
- movdqa xmm1,xmm2
- punpcklwd xmm4,xmm6
- punpckhwd xmm1,xmm6
- movdqa xmm2,xmm4
- movdqa xmm6,xmm1
- pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L
- pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H
- pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L
- pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm4,xmm0
- movdqa xmm1,xmm0
- punpcklwd xmm4,xmm5
- punpckhwd xmm1,xmm5
- movdqa xmm0,xmm4
- movdqa xmm5,xmm1
- pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L
- pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H
- pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L
- pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H
-
- paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
- paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
- paddd xmm0,xmm2 ; xmm0=data1L
- paddd xmm5,xmm6 ; xmm5=data1H
-
- paddd xmm4,[rel PD_DESCALE_P2]
- paddd xmm1,[rel PD_DESCALE_P2]
- psrad xmm4,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm0,[rel PD_DESCALE_P2]
- paddd xmm5,[rel PD_DESCALE_P2]
- psrad xmm0,DESCALE_P2
- psrad xmm5,DESCALE_P2
-
- packssdw xmm4,xmm1 ; xmm4=data7
- packssdw xmm0,xmm5 ; xmm0=data1
-
- movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
-
- movdqa xmm1,xmm3
- movdqa xmm5,xmm3
- punpcklwd xmm1,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm3,xmm1
- movdqa xmm7,xmm5
- pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L
- pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H
- pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L
- pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H
-
- paddd xmm1,xmm2 ; xmm1=data5L
- paddd xmm5,xmm6 ; xmm5=data5H
- paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
- paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
-
- paddd xmm1,[rel PD_DESCALE_P2]
- paddd xmm5,[rel PD_DESCALE_P2]
- psrad xmm1,DESCALE_P2
- psrad xmm5,DESCALE_P2
- paddd xmm3,[rel PD_DESCALE_P2]
- paddd xmm7,[rel PD_DESCALE_P2]
- psrad xmm3,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm1,xmm5 ; xmm1=data5
- packssdw xmm3,xmm7 ; xmm3=data3
-
- movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+ ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ paddw xmm3, xmm1 ; xmm3=tmp10
+ paddw xmm6, xmm7 ; xmm6=tmp11
+ psubw xmm4, xmm1 ; xmm4=tmp13
+ psubw xmm0, xmm7 ; xmm0=tmp12
+
+ movdqa xmm1, xmm3
+ paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
+ psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
+
+ psllw xmm3, PASS1_BITS ; xmm3=data0
+ psllw xmm1, PASS1_BITS ; xmm1=data4
+
+ movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
+ movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm7, xmm4 ; xmm4=tmp13
+ movdqa xmm6, xmm4
+ punpcklwd xmm7, xmm0 ; xmm0=tmp12
+ punpckhwd xmm6, xmm0
+ movdqa xmm4, xmm7
+ movdqa xmm0, xmm6
+ pmaddwd xmm7, [rel PW_F130_F054] ; xmm7=data2L
+ pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=data2H
+ pmaddwd xmm4, [rel PW_F054_MF130] ; xmm4=data6L
+ pmaddwd xmm0, [rel PW_F054_MF130] ; xmm0=data6H
+
+ paddd xmm7, [rel PD_DESCALE_P1]
+ paddd xmm6, [rel PD_DESCALE_P1]
+ psrad xmm7, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+ paddd xmm4, [rel PD_DESCALE_P1]
+ paddd xmm0, [rel PD_DESCALE_P1]
+ psrad xmm4, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm7, xmm6 ; xmm7=data2
+ packssdw xmm4, xmm0 ; xmm4=data6
+
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
+
+ ; -- Odd part
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
+
+ movdqa xmm6, xmm2 ; xmm2=tmp4
+ movdqa xmm0, xmm5 ; xmm5=tmp5
+ paddw xmm6, xmm3 ; xmm6=z3
+ paddw xmm0, xmm1 ; xmm0=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm7, xmm6
+ movdqa xmm4, xmm6
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm4, xmm0
+ movdqa xmm6, xmm7
+ movdqa xmm0, xmm4
+ pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3L
+ pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3H
+ pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4L
+ pmaddwd xmm0, [rel PW_F117_F078] ; xmm0=z4H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ punpcklwd xmm7, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm2, xmm7
+ movdqa xmm1, xmm4
+ pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp4L
+ pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4H
+ pmaddwd xmm2, [rel PW_MF089_F060] ; xmm2=tmp7L
+ pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp7H
+
+ paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
+ paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
+ paddd xmm2, xmm6 ; xmm2=data1L
+ paddd xmm1, xmm0 ; xmm1=data1H
+
+ paddd xmm7, [rel PD_DESCALE_P1]
+ paddd xmm4, [rel PD_DESCALE_P1]
+ psrad xmm7, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm2, [rel PD_DESCALE_P1]
+ paddd xmm1, [rel PD_DESCALE_P1]
+ psrad xmm2, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+
+ packssdw xmm7, xmm4 ; xmm7=data7
+ packssdw xmm2, xmm1 ; xmm2=data1
+
+ movdqa xmm4, xmm5
+ movdqa xmm1, xmm5
+ punpcklwd xmm4, xmm3
+ punpckhwd xmm1, xmm3
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm1
+ pmaddwd xmm4, [rel PW_MF050_MF256] ; xmm4=tmp5L
+ pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5H
+ pmaddwd xmm5, [rel PW_MF256_F050] ; xmm5=tmp6L
+ pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6H
+
+ paddd xmm4, xmm6 ; xmm4=data5L
+ paddd xmm1, xmm0 ; xmm1=data5H
+ paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
+ paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
+
+ paddd xmm4, [rel PD_DESCALE_P1]
+ paddd xmm1, [rel PD_DESCALE_P1]
+ psrad xmm4, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+ paddd xmm5, [rel PD_DESCALE_P1]
+ paddd xmm3, [rel PD_DESCALE_P1]
+ psrad xmm5, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+
+ packssdw xmm4, xmm1 ; xmm4=data5
+ packssdw xmm5, xmm3 ; xmm5=data3
+
+ ; ---- Pass 2: process columns.
+
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
+ movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
+
+ ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+ ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
+ movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
+
+ ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+ ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
+ movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm2, xmm5
+ movdqa xmm7, xmm6
+ psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
+ psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
+ paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
+
+ movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm0, xmm5
+ movdqa xmm3, xmm4
+ paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
+ paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
+ psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm1, xmm7
+ movdqa xmm6, xmm2
+ paddw xmm7, xmm5 ; xmm7=tmp10
+ paddw xmm2, xmm4 ; xmm2=tmp11
+ psubw xmm1, xmm5 ; xmm1=tmp13
+ psubw xmm6, xmm4 ; xmm6=tmp12
+
+ movdqa xmm5, xmm7
+ paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
+ psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
+
+ paddw xmm7, [rel PW_DESCALE_P2X]
+ paddw xmm5, [rel PW_DESCALE_P2X]
+ psraw xmm7, PASS1_BITS ; xmm7=data0
+ psraw xmm5, PASS1_BITS ; xmm5=data4
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+ movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm4, xmm1 ; xmm1=tmp13
+ movdqa xmm2, xmm1
+ punpcklwd xmm4, xmm6 ; xmm6=tmp12
+ punpckhwd xmm2, xmm6
+ movdqa xmm1, xmm4
+ movdqa xmm6, xmm2
+ pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=data2L
+ pmaddwd xmm2, [rel PW_F130_F054] ; xmm2=data2H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=data6L
+ pmaddwd xmm6, [rel PW_F054_MF130] ; xmm6=data6H
+
+ paddd xmm4, [rel PD_DESCALE_P2]
+ paddd xmm2, [rel PD_DESCALE_P2]
+ psrad xmm4, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm1, [rel PD_DESCALE_P2]
+ paddd xmm6, [rel PD_DESCALE_P2]
+ psrad xmm1, DESCALE_P2
+ psrad xmm6, DESCALE_P2
+
+ packssdw xmm4, xmm2 ; xmm4=data2
+ packssdw xmm1, xmm6 ; xmm1=data6
+
+ movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+ ; -- Odd part
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ movdqa xmm2, xmm0 ; xmm0=tmp4
+ movdqa xmm6, xmm3 ; xmm3=tmp5
+ paddw xmm2, xmm7 ; xmm2=z3
+ paddw xmm6, xmm5 ; xmm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm4, xmm2
+ movdqa xmm1, xmm2
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm1, xmm6
+ movdqa xmm2, xmm4
+ movdqa xmm6, xmm1
+ pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3L
+ pmaddwd xmm1, [rel PW_MF078_F117] ; xmm1=z3H
+ pmaddwd xmm2, [rel PW_F117_F078] ; xmm2=z4L
+ pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm4, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm4, xmm5
+ punpckhwd xmm1, xmm5
+ movdqa xmm0, xmm4
+ movdqa xmm5, xmm1
+ pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4L
+ pmaddwd xmm1, [rel PW_MF060_MF089] ; xmm1=tmp4H
+ pmaddwd xmm0, [rel PW_MF089_F060] ; xmm0=tmp7L
+ pmaddwd xmm5, [rel PW_MF089_F060] ; xmm5=tmp7H
+
+ paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
+ paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
+ paddd xmm0, xmm2 ; xmm0=data1L
+ paddd xmm5, xmm6 ; xmm5=data1H
+
+ paddd xmm4, [rel PD_DESCALE_P2]
+ paddd xmm1, [rel PD_DESCALE_P2]
+ psrad xmm4, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm0, [rel PD_DESCALE_P2]
+ paddd xmm5, [rel PD_DESCALE_P2]
+ psrad xmm0, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+
+ packssdw xmm4, xmm1 ; xmm4=data7
+ packssdw xmm0, xmm5 ; xmm0=data1
+
+ movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+ movdqa xmm1, xmm3
+ movdqa xmm5, xmm3
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm5, xmm7
+ movdqa xmm3, xmm1
+ movdqa xmm7, xmm5
+ pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5L
+ pmaddwd xmm5, [rel PW_MF050_MF256] ; xmm5=tmp5H
+ pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6L
+ pmaddwd xmm7, [rel PW_MF256_F050] ; xmm7=tmp6H
+
+ paddd xmm1, xmm2 ; xmm1=data5L
+ paddd xmm5, xmm6 ; xmm5=data5H
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
+ paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
+
+ paddd xmm1, [rel PD_DESCALE_P2]
+ paddd xmm5, [rel PD_DESCALE_P2]
+ psrad xmm1, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+ paddd xmm3, [rel PD_DESCALE_P2]
+ paddd xmm7, [rel PD_DESCALE_P2]
+ psrad xmm3, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm1, xmm5 ; xmm1=data5
+ packssdw xmm3, xmm7 ; xmm3=data3
+
+ movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define CONST_BITS 13
-%define PASS1_BITS 2
+%define CONST_BITS 13
+%define PASS1_BITS 2
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
+%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
+F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
%endif
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_fdct_islow_sse2)
+ alignz 16
+ global EXTN(jconst_fdct_islow_sse2)
EXTN(jconst_fdct_islow_sse2):
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
+PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
+PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Perform the forward DCT on one block of samples.
;
; jsimd_fdct_islow_sse2 (DCTELEM *data)
;
-%define data(b) (b)+8 ; DCTELEM *data
+%define data(b) (b)+8 ; DCTELEM *data
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 6
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 6
- align 16
- global EXTN(jsimd_fdct_islow_sse2)
+ align 16
+ global EXTN(jsimd_fdct_islow_sse2)
EXTN(jsimd_fdct_islow_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
- ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- paddw xmm3,xmm1 ; xmm3=tmp10
- paddw xmm6,xmm7 ; xmm6=tmp11
- psubw xmm4,xmm1 ; xmm4=tmp13
- psubw xmm0,xmm7 ; xmm0=tmp12
-
- movdqa xmm1,xmm3
- paddw xmm3,xmm6 ; xmm3=tmp10+tmp11
- psubw xmm1,xmm6 ; xmm1=tmp10-tmp11
-
- psllw xmm3,PASS1_BITS ; xmm3=data0
- psllw xmm1,PASS1_BITS ; xmm1=data4
-
- movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
- movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm7,xmm4 ; xmm4=tmp13
- movdqa xmm6,xmm4
- punpcklwd xmm7,xmm0 ; xmm0=tmp12
- punpckhwd xmm6,xmm0
- movdqa xmm4,xmm7
- movdqa xmm0,xmm6
- pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
- pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
- pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
- pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
-
- paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm7,DESCALE_P1
- psrad xmm6,DESCALE_P1
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm4,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm7,xmm6 ; xmm7=data2
- packssdw xmm4,xmm0 ; xmm4=data6
-
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
-
- ; -- Odd part
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
-
- movdqa xmm6,xmm2 ; xmm2=tmp4
- movdqa xmm0,xmm5 ; xmm5=tmp5
- paddw xmm6,xmm3 ; xmm6=z3
- paddw xmm0,xmm1 ; xmm0=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm7,xmm6
- movdqa xmm4,xmm6
- punpcklwd xmm7,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm6,xmm7
- movdqa xmm0,xmm4
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
- pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
- pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm7,xmm2
- movdqa xmm4,xmm2
- punpcklwd xmm7,xmm1
- punpckhwd xmm4,xmm1
- movdqa xmm2,xmm7
- movdqa xmm1,xmm4
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
-
- paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
- paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
- paddd xmm2,xmm6 ; xmm2=data1L
- paddd xmm1,xmm0 ; xmm1=data1H
-
- paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm7,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm2,DESCALE_P1
- psrad xmm1,DESCALE_P1
-
- packssdw xmm7,xmm4 ; xmm7=data7
- packssdw xmm2,xmm1 ; xmm2=data1
-
- movdqa xmm4,xmm5
- movdqa xmm1,xmm5
- punpcklwd xmm4,xmm3
- punpckhwd xmm1,xmm3
- movdqa xmm5,xmm4
- movdqa xmm3,xmm1
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
- pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
-
- paddd xmm4,xmm6 ; xmm4=data5L
- paddd xmm1,xmm0 ; xmm1=data5H
- paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
- paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
-
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm4,DESCALE_P1
- psrad xmm1,DESCALE_P1
- paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm5,DESCALE_P1
- psrad xmm3,DESCALE_P1
-
- packssdw xmm4,xmm1 ; xmm4=data5
- packssdw xmm5,xmm3 ; xmm5=data3
-
- ; ---- Pass 2: process columns.
-
-; mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
- movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
-
- ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
- ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
- punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
- movdqa xmm3,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
- punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
-
- movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
- movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
-
- ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
- ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm0,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
- punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
- movdqa xmm3,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
- punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
-
- movdqa xmm4,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
- punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
- punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
- punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
- punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
-
- movdqa xmm5,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm2,xmm5
- movdqa xmm7,xmm6
- psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6
- psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7
- paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1
- paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0
-
- movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
- movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movdqa xmm5,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm0,xmm5
- movdqa xmm3,xmm4
- paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3
- paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2
- psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4
- psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm1,xmm7
- movdqa xmm6,xmm2
- paddw xmm7,xmm5 ; xmm7=tmp10
- paddw xmm2,xmm4 ; xmm2=tmp11
- psubw xmm1,xmm5 ; xmm1=tmp13
- psubw xmm6,xmm4 ; xmm6=tmp12
-
- movdqa xmm5,xmm7
- paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
- psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
-
- paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
- paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
- psraw xmm7,PASS1_BITS ; xmm7=data0
- psraw xmm5,PASS1_BITS ; xmm5=data4
-
- movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
- movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm4,xmm1 ; xmm1=tmp13
- movdqa xmm2,xmm1
- punpcklwd xmm4,xmm6 ; xmm6=tmp12
- punpckhwd xmm2,xmm6
- movdqa xmm1,xmm4
- movdqa xmm6,xmm2
- pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
- pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
- pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
- pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
-
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm4,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm1,DESCALE_P2
- psrad xmm6,DESCALE_P2
-
- packssdw xmm4,xmm2 ; xmm4=data2
- packssdw xmm1,xmm6 ; xmm1=data6
-
- movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
-
- ; -- Odd part
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- movdqa xmm2,xmm0 ; xmm0=tmp4
- movdqa xmm6,xmm3 ; xmm3=tmp5
- paddw xmm2,xmm7 ; xmm2=z3
- paddw xmm6,xmm5 ; xmm6=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm4,xmm2
- movdqa xmm1,xmm2
- punpcklwd xmm4,xmm6
- punpckhwd xmm1,xmm6
- movdqa xmm2,xmm4
- movdqa xmm6,xmm1
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
- pmaddwd xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
- pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm4,xmm0
- movdqa xmm1,xmm0
- punpcklwd xmm4,xmm5
- punpckhwd xmm1,xmm5
- movdqa xmm0,xmm4
- movdqa xmm5,xmm1
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
- pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
-
- paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
- paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
- paddd xmm0,xmm2 ; xmm0=data1L
- paddd xmm5,xmm6 ; xmm5=data1H
-
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm4,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm0,DESCALE_P2
- psrad xmm5,DESCALE_P2
-
- packssdw xmm4,xmm1 ; xmm4=data7
- packssdw xmm0,xmm5 ; xmm0=data1
-
- movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
-
- movdqa xmm1,xmm3
- movdqa xmm5,xmm3
- punpcklwd xmm1,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm3,xmm1
- movdqa xmm7,xmm5
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
- pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
-
- paddd xmm1,xmm2 ; xmm1=data5L
- paddd xmm5,xmm6 ; xmm5=data5H
- paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
- paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
-
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm1,DESCALE_P2
- psrad xmm5,DESCALE_P2
- paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm3,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm1,xmm5 ; xmm1=data5
- packssdw xmm3,xmm7 ; xmm3=data3
-
- movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+ ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ paddw xmm3, xmm1 ; xmm3=tmp10
+ paddw xmm6, xmm7 ; xmm6=tmp11
+ psubw xmm4, xmm1 ; xmm4=tmp13
+ psubw xmm0, xmm7 ; xmm0=tmp12
+
+ movdqa xmm1, xmm3
+ paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
+ psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
+
+ psllw xmm3, PASS1_BITS ; xmm3=data0
+ psllw xmm1, PASS1_BITS ; xmm1=data4
+
+ movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
+ movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm7, xmm4 ; xmm4=tmp13
+ movdqa xmm6, xmm4
+ punpcklwd xmm7, xmm0 ; xmm0=tmp12
+ punpckhwd xmm6, xmm0
+ movdqa xmm4, xmm7
+ movdqa xmm0, xmm6
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
+
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm7, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm4, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm7, xmm6 ; xmm7=data2
+ packssdw xmm4, xmm0 ; xmm4=data6
+
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
+
+ ; -- Odd part
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
+
+ movdqa xmm6, xmm2 ; xmm2=tmp4
+ movdqa xmm0, xmm5 ; xmm5=tmp5
+ paddw xmm6, xmm3 ; xmm6=z3
+ paddw xmm0, xmm1 ; xmm0=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm7, xmm6
+ movdqa xmm4, xmm6
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm4, xmm0
+ movdqa xmm6, xmm7
+ movdqa xmm0, xmm4
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ punpcklwd xmm7, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm2, xmm7
+ movdqa xmm1, xmm4
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
+
+ paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
+ paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
+ paddd xmm2, xmm6 ; xmm2=data1L
+ paddd xmm1, xmm0 ; xmm1=data1H
+
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm7, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm2, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+
+ packssdw xmm7, xmm4 ; xmm7=data7
+ packssdw xmm2, xmm1 ; xmm2=data1
+
+ movdqa xmm4, xmm5
+ movdqa xmm1, xmm5
+ punpcklwd xmm4, xmm3
+ punpckhwd xmm1, xmm3
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
+
+ paddd xmm4, xmm6 ; xmm4=data5L
+ paddd xmm1, xmm0 ; xmm1=data5H
+ paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
+ paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm4, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm5, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+
+ packssdw xmm4, xmm1 ; xmm4=data5
+ packssdw xmm5, xmm3 ; xmm5=data3
+
+ ; ---- Pass 2: process columns.
+
+; mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
+ movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
+
+ ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+ ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
+ movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
+
+ ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+ ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
+ movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm2, xmm5
+ movdqa xmm7, xmm6
+ psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
+ psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
+ paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
+
+ movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm0, xmm5
+ movdqa xmm3, xmm4
+ paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
+ paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
+ psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm1, xmm7
+ movdqa xmm6, xmm2
+ paddw xmm7, xmm5 ; xmm7=tmp10
+ paddw xmm2, xmm4 ; xmm2=tmp11
+ psubw xmm1, xmm5 ; xmm1=tmp13
+ psubw xmm6, xmm4 ; xmm6=tmp12
+
+ movdqa xmm5, xmm7
+ paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
+ psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
+
+ paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ psraw xmm7, PASS1_BITS ; xmm7=data0
+ psraw xmm5, PASS1_BITS ; xmm5=data4
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+ movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm4, xmm1 ; xmm1=tmp13
+ movdqa xmm2, xmm1
+ punpcklwd xmm4, xmm6 ; xmm6=tmp12
+ punpckhwd xmm2, xmm6
+ movdqa xmm1, xmm4
+ movdqa xmm6, xmm2
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm4, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm1, DESCALE_P2
+ psrad xmm6, DESCALE_P2
+
+ packssdw xmm4, xmm2 ; xmm4=data2
+ packssdw xmm1, xmm6 ; xmm1=data6
+
+ movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+ ; -- Odd part
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ movdqa xmm2, xmm0 ; xmm0=tmp4
+ movdqa xmm6, xmm3 ; xmm3=tmp5
+ paddw xmm2, xmm7 ; xmm2=z3
+ paddw xmm6, xmm5 ; xmm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm4, xmm2
+ movdqa xmm1, xmm2
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm1, xmm6
+ movdqa xmm2, xmm4
+ movdqa xmm6, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm4, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm4, xmm5
+ punpckhwd xmm1, xmm5
+ movdqa xmm0, xmm4
+ movdqa xmm5, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
+
+ paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
+ paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
+ paddd xmm0, xmm2 ; xmm0=data1L
+ paddd xmm5, xmm6 ; xmm5=data1H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm4, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm0, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+
+ packssdw xmm4, xmm1 ; xmm4=data7
+ packssdw xmm0, xmm5 ; xmm0=data1
+
+ movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+ movdqa xmm1, xmm3
+ movdqa xmm5, xmm3
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm5, xmm7
+ movdqa xmm3, xmm1
+ movdqa xmm7, xmm5
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
+
+ paddd xmm1, xmm2 ; xmm1=data5L
+ paddd xmm5, xmm6 ; xmm5=data5H
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
+ paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
+
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm1, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+ paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm3, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm1, xmm5 ; xmm1=data5
+ packssdw xmm3, xmm7 ; xmm3=data3
+
+ movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
%endmacro
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
%endmacro
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_idct_float_sse2)
+ alignz 16
+ global EXTN(jconst_idct_float_sse2)
EXTN(jconst_idct_float_sse2):
-PD_1_414 times 4 dd 1.414213562373095048801689
-PD_1_847 times 4 dd 1.847759065022573512256366
-PD_1_082 times 4 dd 1.082392200292393968799446
-PD_M2_613 times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+PD_1_414 times 4 dd 1.414213562373095048801689
+PD_1_847 times 4 dd 1.847759065022573512256366
+PD_1_082 times 4 dd 1.082392200292393968799446
+PD_M2_613 times 4 dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+%define original_rbp rbp+0
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
; FAST_FLOAT workspace[DCTSIZE2]
- align 16
- global EXTN(jsimd_idct_float_sse2)
+ align 16
+ global EXTN(jsimd_idct_float_sse2)
EXTN(jsimd_idct_float_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [workspace]
- collect_args
- push rbx
-
- ; ---- Pass 1: process columns from input, store into work array.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
- lea rdi, [workspace] ; FAST_FLOAT *wsptr
- mov rcx, DCTSIZE/4 ; ctr
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [workspace]
+ collect_args
+ push rbx
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+ lea rdi, [workspace] ; FAST_FLOAT *wsptr
+ mov rcx, DCTSIZE/4 ; ctr
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm1,xmm2
- por xmm3,xmm4
- por xmm5,xmm6
- por xmm1,xmm3
- por xmm5,xmm7
- por xmm1,xmm5
- packsswb xmm1,xmm1
- movd eax,xmm1
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm1,xmm0
- movaps xmm2,xmm0
- movaps xmm3,xmm0
-
- shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
- shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
- shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
- shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
-
- movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
- jmp near .nextcolumn
+ mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm2
+ por xmm3, xmm4
+ por xmm5, xmm6
+ por xmm1, xmm3
+ por xmm5, xmm7
+ por xmm1, xmm5
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
%endif
.columnDCT:
- ; -- Even part
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
-
- punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
- punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
- cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
- cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[rel PD_1_414]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-
- punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
- punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
- cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
- cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
-
- punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
- punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
- psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
- cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
- cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
-
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[rel PD_1_847] ; xmm0=z5
- mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
- addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
- subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
- subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
- movaps xmm3,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
- unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
-
- movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
- movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm0,xmm7
- movaps xmm3,xmm5
- addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
- addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
- subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
- subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
-
- movaps xmm2,xmm7 ; transpose coefficients(phase 1)
- unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
- unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
- movaps xmm4,xmm5 ; transpose coefficients(phase 1)
- unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
- unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
-
- movaps xmm3,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
- unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
- movaps xmm0,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
- unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
- movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-
- movaps xmm6,xmm5 ; transpose coefficients(phase 2)
- unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
- unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
- movaps xmm3,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
- unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
- movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ ; -- Even part
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
+
+ punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
+ punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [rel PD_1_414]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
+ punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
+
+ punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
+ punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
+ psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
+ cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [rel PD_1_847] ; xmm0=z5
+ mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7
+ movaps xmm3, xmm5
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
.nextcolumn:
- add rsi, byte 4*SIZEOF_JCOEF ; coef_block
- add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
- add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
- dec rcx ; ctr
- jnz near .columnloop
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov rax, [original_rbp]
- lea rsi, [workspace] ; FAST_FLOAT *wsptr
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
- mov rcx, DCTSIZE/4 ; ctr
+ add rsi, byte 4*SIZEOF_JCOEF ; coef_block
+ add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec rcx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ lea rsi, [workspace] ; FAST_FLOAT *wsptr
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+ mov rcx, DCTSIZE/4 ; ctr
.rowloop:
- ; -- Even part
-
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[rel PD_1_414]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[rel PD_1_847] ; xmm0=z5
- mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
- addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
- subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
- subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
- pcmpeqd xmm3,xmm3
- psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
- addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
- addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
- addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
- pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
- pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
- por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
- por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
-
- movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm7,xmm1
- movaps xmm5,xmm3
- addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
- addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
- subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
- subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
-
- movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
- pcmpeqd xmm4,xmm4
- psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
- addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
- addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
- addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
- pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
- pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
- por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
- por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
-
- movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
-
- packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
- packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
- paddb xmm6,xmm2
- paddb xmm1,xmm2
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
- punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
- pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
-
- add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
- add rdi, byte 4*SIZEOF_JSAMPROW
- dec rcx ; ctr
- jnz near .rowloop
-
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [rel PD_1_414]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [rel PD_1_847] ; xmm0=z5
+ mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
+ pcmpeqd xmm3, xmm3
+ psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+ addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+ addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+ addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+ pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+ pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+ por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
+ por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
+
+ movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm7, xmm1
+ movaps xmm5, xmm3
+ addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
+ addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
+ subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
+ subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
+
+ movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
+ pcmpeqd xmm4, xmm4
+ psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+ addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+ addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+ addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+ pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+ pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+ por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
+ por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
+
+ packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+ packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+ paddb xmm6, xmm2
+ paddb xmm1, xmm2
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
+ punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+ pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+ mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+ add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add rdi, byte 4*SIZEOF_JSAMPROW
+ dec rcx ; ctr
+ jnz near .rowloop
+
+ pop rbx
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
%endmacro
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
%endmacro
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_idct_float_sse2)
+ alignz 16
+ global EXTN(jconst_idct_float_sse2)
EXTN(jconst_idct_float_sse2):
-PD_1_414 times 4 dd 1.414213562373095048801689
-PD_1_847 times 4 dd 1.847759065022573512256366
-PD_1_082 times 4 dd 1.082392200292393968799446
-PD_M2_613 times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+PD_1_414 times 4 dd 1.414213562373095048801689
+PD_1_847 times 4 dd 1.847759065022573512256366
+PD_1_082 times 4 dd 1.082392200292393968799446
+PD_M2_613 times 4 dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; JSAMPARRAY output_buf, JDIMENSION output_col)
;
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
+%define dct_table(b) (b)+8 ; void *dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
; FAST_FLOAT workspace[DCTSIZE2]
- align 16
- global EXTN(jsimd_idct_float_sse2)
+ align 16
+ global EXTN(jsimd_idct_float_sse2)
EXTN(jsimd_idct_float_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; FAST_FLOAT *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm1,xmm2
- por xmm3,xmm4
- por xmm5,xmm6
- por xmm1,xmm3
- por xmm5,xmm7
- por xmm1,xmm5
- packsswb xmm1,xmm1
- movd eax,xmm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm1,xmm0
- movaps xmm2,xmm0
- movaps xmm3,xmm0
-
- shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
- shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
- shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
- shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
- jmp near .nextcolumn
- alignx 16,7
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm2
+ por xmm3, xmm4
+ por xmm5, xmm6
+ por xmm1, xmm3
+ por xmm5, xmm7
+ por xmm1, xmm5
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
+ alignx 16, 7
%endif
.columnDCT:
- ; -- Even part
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
-
- punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
- punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
- cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
- cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[GOTOFF(ebx,PD_1_414)]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
- punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
- cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
- cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
-
- punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
- punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
- psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
- cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
- cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
-
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
- mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
- addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
- subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
- subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
- movaps xmm3,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
- unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
-
- movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
- movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm0,xmm7
- movaps xmm3,xmm5
- addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
- addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
- subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
- subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
-
- movaps xmm2,xmm7 ; transpose coefficients(phase 1)
- unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
- unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
- movaps xmm4,xmm5 ; transpose coefficients(phase 1)
- unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
- unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
-
- movaps xmm3,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
- unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
- movaps xmm0,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
- unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
- movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
- movaps xmm6,xmm5 ; transpose coefficients(phase 2)
- unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
- unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
- movaps xmm3,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
- unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
- movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ ; -- Even part
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
+
+ punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
+ punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
+ punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
+
+ punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
+ punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
+ psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
+ cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7
+ movaps xmm3, xmm5
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
- add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; FAST_FLOAT *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; FAST_FLOAT *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
.rowloop:
- ; -- Even part
-
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[GOTOFF(ebx,PD_1_414)]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
- mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
- addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
- subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
- subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
- pcmpeqd xmm3,xmm3
- psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
- addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
- addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
- addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
- pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
- pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
- por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
- por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
-
- movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm7,xmm1
- movaps xmm5,xmm3
- addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
- addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
- subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
- subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
-
- movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
- pcmpeqd xmm4,xmm4
- psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
- addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
- addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
- addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
- pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
- pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
- por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
- por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
-
- movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
-
- packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
- packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
- paddb xmm6,xmm2
- paddb xmm1,xmm2
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
- punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
- pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
-
- poppic ebx ; restore GOT address
-
- add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
- add edi, byte 4*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
+ pcmpeqd xmm3, xmm3
+ psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+ addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+ addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+ addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+ pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+ pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+ por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
+ por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
+
+ movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm7, xmm1
+ movaps xmm5, xmm3
+ addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
+ addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
+ subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
+ subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
+
+ movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
+ pcmpeqd xmm4, xmm4
+ psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+ addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+ addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+ addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+ pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+ pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+ por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
+ por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
+
+ packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+ packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+ paddb xmm6, xmm2
+ paddb xmm1, xmm2
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
+ punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+ pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define CONST_BITS 8 ; 14 is also OK.
-%define PASS1_BITS 2
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2
%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif
%if CONST_BITS == 8
-F_1_082 equ 277 ; FIX(1.082392200)
-F_1_414 equ 362 ; FIX(1.414213562)
-F_1_847 equ 473 ; FIX(1.847759065)
-F_2_613 equ 669 ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
-F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%endif
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
- alignz 16
- global EXTN(jconst_idct_ifast_sse2)
+ alignz 16
+ global EXTN(jconst_idct_ifast_sse2)
EXTN(jconst_idct_ifast_sse2):
-PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
-PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
-PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
+%define original_rbp rbp+0
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
- align 16
- global EXTN(jsimd_idct_ifast_sse2)
+ align 16
+ global EXTN(jsimd_idct_ifast_sse2)
EXTN(jsimd_idct_ifast_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
- ; ---- Pass 1: process columns from input.
+ ; ---- Pass 1: process columns from input.
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
-
- pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
- jmp near .column_end
+ mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
+ jmp near .column_end
%endif
.columnDCT:
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- psubw xmm0,xmm2 ; xmm0=tmp11
- psubw xmm1,xmm3
- paddw xmm4,xmm2 ; xmm4=tmp10
- paddw xmm5,xmm3 ; xmm5=tmp13
-
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm1,[rel PW_F1414]
- psubw xmm1,xmm5 ; xmm1=tmp12
-
- movdqa xmm6,xmm4
- movdqa xmm7,xmm0
- psubw xmm4,xmm5 ; xmm4=tmp3
- psubw xmm0,xmm1 ; xmm0=tmp2
- paddw xmm6,xmm5 ; xmm6=tmp0
- paddw xmm7,xmm1 ; xmm7=tmp1
-
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
-
- ; -- Odd part
-
- movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm2
- movdqa xmm0,xmm5
- psubw xmm2,xmm1 ; xmm2=z12
- psubw xmm5,xmm3 ; xmm5=z10
- paddw xmm4,xmm1 ; xmm4=z11
- paddw xmm0,xmm3 ; xmm0=z13
-
- movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm3,xmm4
- psubw xmm4,xmm0
- paddw xmm3,xmm0 ; xmm3=tmp7
-
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm0,xmm5
- paddw xmm5,xmm2
- pmulhw xmm5,[rel PW_F1847] ; xmm5=z5
- pmulhw xmm0,[rel PW_MF1613]
- pmulhw xmm2,[rel PW_F1082]
- psubw xmm0,xmm1
- psubw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm0,xmm5 ; xmm0=tmp12
-
- ; -- Final output stage
-
- psubw xmm0,xmm3 ; xmm0=tmp6
- movdqa xmm1,xmm6
- movdqa xmm5,xmm7
- paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
- paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
- psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
- psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
- psubw xmm4,xmm0 ; xmm4=tmp5
-
- movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
- punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
- movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
- punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
-
- paddw xmm2,xmm4 ; xmm2=tmp4
- movdqa xmm5,xmm7
- movdqa xmm0,xmm1
- paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
- paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
- psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
- psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
-
- movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
- punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
- punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
- punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
-
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
- punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
- movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
- punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
- punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ psubw xmm0, xmm2 ; xmm0=tmp11
+ psubw xmm1, xmm3
+ paddw xmm4, xmm2 ; xmm4=tmp10
+ paddw xmm5, xmm3 ; xmm5=tmp13
+
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm1, [rel PW_F1414]
+ psubw xmm1, xmm5 ; xmm1=tmp12
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm0
+ psubw xmm4, xmm5 ; xmm4=tmp3
+ psubw xmm0, xmm1 ; xmm0=tmp2
+ paddw xmm6, xmm5 ; xmm6=tmp0
+ paddw xmm7, xmm1 ; xmm7=tmp1
+
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm2
+ movdqa xmm0, xmm5
+ psubw xmm2, xmm1 ; xmm2=z12
+ psubw xmm5, xmm3 ; xmm5=z10
+ paddw xmm4, xmm1 ; xmm4=z11
+ paddw xmm0, xmm3 ; xmm0=z13
+
+ movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm3, xmm4
+ psubw xmm4, xmm0
+ paddw xmm3, xmm0 ; xmm3=tmp7
+
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm0, xmm5
+ paddw xmm5, xmm2
+ pmulhw xmm5, [rel PW_F1847] ; xmm5=z5
+ pmulhw xmm0, [rel PW_MF1613]
+ pmulhw xmm2, [rel PW_F1082]
+ psubw xmm0, xmm1
+ psubw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm0, xmm5 ; xmm0=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm0, xmm3 ; xmm0=tmp6
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm7
+ paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
+ paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
+ psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
+ psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
+ psubw xmm4, xmm0 ; xmm4=tmp5
+
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
+ movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
+
+ paddw xmm2, xmm4 ; xmm2=tmp4
+ movdqa xmm5, xmm7
+ movdqa xmm0, xmm1
+ paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
+ paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
+ psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+ psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
+ punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
+
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
+ punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
+ punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov rax, [original_rbp]
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
-
- ; -- Even part
-
- ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
- movdqa xmm2,xmm6
- movdqa xmm0,xmm5
- psubw xmm6,xmm1 ; xmm6=tmp11
- psubw xmm5,xmm3
- paddw xmm2,xmm1 ; xmm2=tmp10
- paddw xmm0,xmm3 ; xmm0=tmp13
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[rel PW_F1414]
- psubw xmm5,xmm0 ; xmm5=tmp12
-
- movdqa xmm1,xmm2
- movdqa xmm3,xmm6
- psubw xmm2,xmm0 ; xmm2=tmp3
- psubw xmm6,xmm5 ; xmm6=tmp2
- paddw xmm1,xmm0 ; xmm1=tmp0
- paddw xmm3,xmm5 ; xmm3=tmp1
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
-
- ; -- Odd part
-
- ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
- movdqa xmm2,xmm0
- movdqa xmm6,xmm4
- psubw xmm0,xmm7 ; xmm0=z12
- psubw xmm4,xmm5 ; xmm4=z10
- paddw xmm2,xmm7 ; xmm2=z11
- paddw xmm6,xmm5 ; xmm6=z13
-
- movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm5,xmm2
- psubw xmm2,xmm6
- paddw xmm5,xmm6 ; xmm5=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm6,xmm4
- paddw xmm4,xmm0
- pmulhw xmm4,[rel PW_F1847] ; xmm4=z5
- pmulhw xmm6,[rel PW_MF1613]
- pmulhw xmm0,[rel PW_F1082]
- psubw xmm6,xmm7
- psubw xmm0,xmm4 ; xmm0=tmp10
- paddw xmm6,xmm4 ; xmm6=tmp12
-
- ; -- Final output stage
-
- psubw xmm6,xmm5 ; xmm6=tmp6
- movdqa xmm7,xmm1
- movdqa xmm4,xmm3
- paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
- paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- psraw xmm1,(PASS1_BITS+3) ; descale
- psraw xmm3,(PASS1_BITS+3) ; descale
- psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
- psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
- psraw xmm7,(PASS1_BITS+3) ; descale
- psraw xmm4,(PASS1_BITS+3) ; descale
- psubw xmm2,xmm6 ; xmm2=tmp5
-
- packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
-
- paddw xmm0,xmm2 ; xmm0=tmp4
- movdqa xmm4,xmm5
- movdqa xmm7,xmm6
- paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
- paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
- psraw xmm5,(PASS1_BITS+3) ; descale
- psraw xmm6,(PASS1_BITS+3) ; descale
- psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
- psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
- psraw xmm4,(PASS1_BITS+3) ; descale
- psraw xmm7,(PASS1_BITS+3) ; descale
-
- movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
-
- packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm1,xmm2
- paddb xmm3,xmm2
- paddb xmm5,xmm2
- paddb xmm7,xmm2
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
- punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
- mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
-
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
- mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
- ret
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; -- Even part
+
+ ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+ movdqa xmm2, xmm6
+ movdqa xmm0, xmm5
+ psubw xmm6, xmm1 ; xmm6=tmp11
+ psubw xmm5, xmm3
+ paddw xmm2, xmm1 ; xmm2=tmp10
+ paddw xmm0, xmm3 ; xmm0=tmp13
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F1414]
+ psubw xmm5, xmm0 ; xmm5=tmp12
+
+ movdqa xmm1, xmm2
+ movdqa xmm3, xmm6
+ psubw xmm2, xmm0 ; xmm2=tmp3
+ psubw xmm6, xmm5 ; xmm6=tmp2
+ paddw xmm1, xmm0 ; xmm1=tmp0
+ paddw xmm3, xmm5 ; xmm3=tmp1
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
+
+ ; -- Odd part
+
+ ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm4
+ psubw xmm0, xmm7 ; xmm0=z12
+ psubw xmm4, xmm5 ; xmm4=z10
+ paddw xmm2, xmm7 ; xmm2=z11
+ paddw xmm6, xmm5 ; xmm6=z13
+
+ movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm5, xmm2
+ psubw xmm2, xmm6
+ paddw xmm5, xmm6 ; xmm5=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm6, xmm4
+ paddw xmm4, xmm0
+ pmulhw xmm4, [rel PW_F1847] ; xmm4=z5
+ pmulhw xmm6, [rel PW_MF1613]
+ pmulhw xmm0, [rel PW_F1082]
+ psubw xmm6, xmm7
+ psubw xmm0, xmm4 ; xmm0=tmp10
+ paddw xmm6, xmm4 ; xmm6=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm6, xmm5 ; xmm6=tmp6
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm3
+ paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
+ paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ psraw xmm1, (PASS1_BITS+3) ; descale
+ psraw xmm3, (PASS1_BITS+3) ; descale
+ psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
+ psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+ psraw xmm7, (PASS1_BITS+3) ; descale
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psubw xmm2, xmm6 ; xmm2=tmp5
+
+ packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
+
+ paddw xmm0, xmm2 ; xmm0=tmp4
+ movdqa xmm4, xmm5
+ movdqa xmm7, xmm6
+ paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
+ paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
+ psraw xmm5, (PASS1_BITS+3) ; descale
+ psraw xmm6, (PASS1_BITS+3) ; descale
+ psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+ psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psraw xmm7, (PASS1_BITS+3) ; descale
+
+ movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
+
+ packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm1, xmm2
+ paddb xmm3, xmm2
+ paddb xmm5, xmm2
+ paddb xmm7, xmm2
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
+ punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+ mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+ mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+ mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define CONST_BITS 8 ; 14 is also OK.
-%define PASS1_BITS 2
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2
%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif
%if CONST_BITS == 8
-F_1_082 equ 277 ; FIX(1.082392200)
-F_1_414 equ 362 ; FIX(1.414213562)
-F_1_847 equ 473 ; FIX(1.847759065)
-F_2_613 equ 669 ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
-F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%endif
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
- alignz 16
- global EXTN(jconst_idct_ifast_sse2)
+ alignz 16
+ global EXTN(jconst_idct_ifast_sse2)
EXTN(jconst_idct_ifast_sse2):
-PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
-PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
-PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; JSAMPARRAY output_buf, JDIMENSION output_col)
;
-%define dct_table(b) (b)+8 ; jpeg_component_info *compptr
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
+%define dct_table(b) (b)+8 ; jpeg_component_info *compptr
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
- align 16
- global EXTN(jsimd_idct_ifast_sse2)
+ align 16
+ global EXTN(jsimd_idct_ifast_sse2)
EXTN(jsimd_idct_ifast_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
-
- pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
- jmp near .column_end
- alignx 16,7
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
+ jmp near .column_end
+ alignx 16, 7
%endif
.columnDCT:
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- psubw xmm0,xmm2 ; xmm0=tmp11
- psubw xmm1,xmm3
- paddw xmm4,xmm2 ; xmm4=tmp10
- paddw xmm5,xmm3 ; xmm5=tmp13
-
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm1,[GOTOFF(ebx,PW_F1414)]
- psubw xmm1,xmm5 ; xmm1=tmp12
-
- movdqa xmm6,xmm4
- movdqa xmm7,xmm0
- psubw xmm4,xmm5 ; xmm4=tmp3
- psubw xmm0,xmm1 ; xmm0=tmp2
- paddw xmm6,xmm5 ; xmm6=tmp0
- paddw xmm7,xmm1 ; xmm7=tmp1
-
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
-
- ; -- Odd part
-
- movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm2
- movdqa xmm0,xmm5
- psubw xmm2,xmm1 ; xmm2=z12
- psubw xmm5,xmm3 ; xmm5=z10
- paddw xmm4,xmm1 ; xmm4=z11
- paddw xmm0,xmm3 ; xmm0=z13
-
- movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm3,xmm4
- psubw xmm4,xmm0
- paddw xmm3,xmm0 ; xmm3=tmp7
-
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm0,xmm5
- paddw xmm5,xmm2
- pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5
- pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)]
- pmulhw xmm2,[GOTOFF(ebx,PW_F1082)]
- psubw xmm0,xmm1
- psubw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm0,xmm5 ; xmm0=tmp12
-
- ; -- Final output stage
-
- psubw xmm0,xmm3 ; xmm0=tmp6
- movdqa xmm1,xmm6
- movdqa xmm5,xmm7
- paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
- paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
- psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
- psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
- psubw xmm4,xmm0 ; xmm4=tmp5
-
- movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
- punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
- movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
- punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
-
- paddw xmm2,xmm4 ; xmm2=tmp4
- movdqa xmm5,xmm7
- movdqa xmm0,xmm1
- paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
- paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
- psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
- psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
-
- movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
- punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
- punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
- punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
-
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
- punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
- movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
- punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
- punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ psubw xmm0, xmm2 ; xmm0=tmp11
+ psubw xmm1, xmm3
+ paddw xmm4, xmm2 ; xmm4=tmp10
+ paddw xmm5, xmm3 ; xmm5=tmp13
+
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm1, [GOTOFF(ebx,PW_F1414)]
+ psubw xmm1, xmm5 ; xmm1=tmp12
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm0
+ psubw xmm4, xmm5 ; xmm4=tmp3
+ psubw xmm0, xmm1 ; xmm0=tmp2
+ paddw xmm6, xmm5 ; xmm6=tmp0
+ paddw xmm7, xmm1 ; xmm7=tmp1
+
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm2
+ movdqa xmm0, xmm5
+ psubw xmm2, xmm1 ; xmm2=z12
+ psubw xmm5, xmm3 ; xmm5=z10
+ paddw xmm4, xmm1 ; xmm4=z11
+ paddw xmm0, xmm3 ; xmm0=z13
+
+ movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm3, xmm4
+ psubw xmm4, xmm0
+ paddw xmm3, xmm0 ; xmm3=tmp7
+
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm0, xmm5
+ paddw xmm5, xmm2
+ pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5
+ pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw xmm2, [GOTOFF(ebx,PW_F1082)]
+ psubw xmm0, xmm1
+ psubw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm0, xmm5 ; xmm0=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm0, xmm3 ; xmm0=tmp6
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm7
+ paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
+ paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
+ psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
+ psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
+ psubw xmm4, xmm0 ; xmm4=tmp5
+
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
+ movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
+
+ paddw xmm2, xmm4 ; xmm2=tmp4
+ movdqa xmm5, xmm7
+ movdqa xmm0, xmm1
+ paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
+ paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
+ psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+ psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
+ punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
+
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
+ punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
+ punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
-
- ; -- Even part
-
- ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
- movdqa xmm2,xmm6
- movdqa xmm0,xmm5
- psubw xmm6,xmm1 ; xmm6=tmp11
- psubw xmm5,xmm3
- paddw xmm2,xmm1 ; xmm2=tmp10
- paddw xmm0,xmm3 ; xmm0=tmp13
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[GOTOFF(ebx,PW_F1414)]
- psubw xmm5,xmm0 ; xmm5=tmp12
-
- movdqa xmm1,xmm2
- movdqa xmm3,xmm6
- psubw xmm2,xmm0 ; xmm2=tmp3
- psubw xmm6,xmm5 ; xmm6=tmp2
- paddw xmm1,xmm0 ; xmm1=tmp0
- paddw xmm3,xmm5 ; xmm3=tmp1
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
-
- ; -- Odd part
-
- ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
- movdqa xmm2,xmm0
- movdqa xmm6,xmm4
- psubw xmm0,xmm7 ; xmm0=z12
- psubw xmm4,xmm5 ; xmm4=z10
- paddw xmm2,xmm7 ; xmm2=z11
- paddw xmm6,xmm5 ; xmm6=z13
-
- movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm5,xmm2
- psubw xmm2,xmm6
- paddw xmm5,xmm6 ; xmm5=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm6,xmm4
- paddw xmm4,xmm0
- pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5
- pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)]
- pmulhw xmm0,[GOTOFF(ebx,PW_F1082)]
- psubw xmm6,xmm7
- psubw xmm0,xmm4 ; xmm0=tmp10
- paddw xmm6,xmm4 ; xmm6=tmp12
-
- ; -- Final output stage
-
- psubw xmm6,xmm5 ; xmm6=tmp6
- movdqa xmm7,xmm1
- movdqa xmm4,xmm3
- paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
- paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- psraw xmm1,(PASS1_BITS+3) ; descale
- psraw xmm3,(PASS1_BITS+3) ; descale
- psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
- psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
- psraw xmm7,(PASS1_BITS+3) ; descale
- psraw xmm4,(PASS1_BITS+3) ; descale
- psubw xmm2,xmm6 ; xmm2=tmp5
-
- packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
-
- paddw xmm0,xmm2 ; xmm0=tmp4
- movdqa xmm4,xmm5
- movdqa xmm7,xmm6
- paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
- paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
- psraw xmm5,(PASS1_BITS+3) ; descale
- psraw xmm6,(PASS1_BITS+3) ; descale
- psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
- psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
- psraw xmm4,(PASS1_BITS+3) ; descale
- psraw xmm7,(PASS1_BITS+3) ; descale
-
- movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
-
- packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm1,xmm2
- paddb xmm3,xmm2
- paddb xmm5,xmm2
- paddb xmm7,xmm2
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
- punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
- mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
-
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
- mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+ movdqa xmm2, xmm6
+ movdqa xmm0, xmm5
+ psubw xmm6, xmm1 ; xmm6=tmp11
+ psubw xmm5, xmm3
+ paddw xmm2, xmm1 ; xmm2=tmp10
+ paddw xmm0, xmm3 ; xmm0=tmp13
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F1414)]
+ psubw xmm5, xmm0 ; xmm5=tmp12
+
+ movdqa xmm1, xmm2
+ movdqa xmm3, xmm6
+ psubw xmm2, xmm0 ; xmm2=tmp3
+ psubw xmm6, xmm5 ; xmm6=tmp2
+ paddw xmm1, xmm0 ; xmm1=tmp0
+ paddw xmm3, xmm5 ; xmm3=tmp1
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
+
+ ; -- Odd part
+
+ ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm4
+ psubw xmm0, xmm7 ; xmm0=z12
+ psubw xmm4, xmm5 ; xmm4=z10
+ paddw xmm2, xmm7 ; xmm2=z11
+ paddw xmm6, xmm5 ; xmm6=z13
+
+ movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm5, xmm2
+ psubw xmm2, xmm6
+ paddw xmm5, xmm6 ; xmm5=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm6, xmm4
+ paddw xmm4, xmm0
+ pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5
+ pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw xmm0, [GOTOFF(ebx,PW_F1082)]
+ psubw xmm6, xmm7
+ psubw xmm0, xmm4 ; xmm0=tmp10
+ paddw xmm6, xmm4 ; xmm6=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm6, xmm5 ; xmm6=tmp6
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm3
+ paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
+ paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ psraw xmm1, (PASS1_BITS+3) ; descale
+ psraw xmm3, (PASS1_BITS+3) ; descale
+ psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
+ psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+ psraw xmm7, (PASS1_BITS+3) ; descale
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psubw xmm2, xmm6 ; xmm2=tmp5
+
+ packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
+
+ paddw xmm0, xmm2 ; xmm0=tmp4
+ movdqa xmm4, xmm5
+ movdqa xmm7, xmm6
+ paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
+ paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
+ psraw xmm5, (PASS1_BITS+3) ; descale
+ psraw xmm6, (PASS1_BITS+3) ; descale
+ psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+ psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psraw xmm7, (PASS1_BITS+3) ; descale
+
+ movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
+
+ packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm1, xmm2
+ paddb xmm3, xmm2
+ paddb xmm5, xmm2
+ paddb xmm7, xmm2
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
+ punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+ mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define CONST_BITS 13
-%define PASS1_BITS 2
+%define CONST_BITS 13
+%define PASS1_BITS 2
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
+%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
+F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
%endif
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_idct_islow_sse2)
+ alignz 16
+ global EXTN(jconst_idct_islow_sse2)
EXTN(jconst_idct_islow_sse2):
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298-F_0_899), -F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053-F_2_562), -F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 12
+%define original_rbp rbp+0
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 12
- align 16
- global EXTN(jsimd_idct_islow_sse2)
+ align 16
+ global EXTN(jsimd_idct_islow_sse2)
EXTN(jsimd_idct_islow_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
- ; ---- Pass 1: process columns from input.
+ ; ---- Pass 1: process columns from input.
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm5,PASS1_BITS
-
- movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
- punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
-
- pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
- jmp near .column_end
+ mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm5, PASS1_BITS
+
+ movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ jmp near .column_end
%endif
.columnDCT:
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm4,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm4,xmm3 ; xmm3=in6=z3
- punpckhwd xmm5,xmm3
- movdqa xmm1,xmm4
- movdqa xmm3,xmm5
- pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L
- pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H
- pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L
- pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H
-
- movdqa xmm6,xmm0
- paddw xmm0,xmm2 ; xmm0=in0+in4
- psubw xmm6,xmm2 ; xmm6=in0-in4
-
- pxor xmm7,xmm7
- pxor xmm2,xmm2
- punpcklwd xmm7,xmm0 ; xmm7=tmp0L
- punpckhwd xmm2,xmm0 ; xmm2=tmp0H
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
- psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
- movdqa xmm0,xmm7
- paddd xmm7,xmm4 ; xmm7=tmp10L
- psubd xmm0,xmm4 ; xmm0=tmp13L
- movdqa xmm4,xmm2
- paddd xmm2,xmm5 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm7,xmm7
- punpcklwd xmm5,xmm6 ; xmm5=tmp1L
- punpckhwd xmm7,xmm6 ; xmm7=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
- movdqa xmm2,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm2,xmm1 ; xmm2=tmp12L
- movdqa xmm0,xmm7
- paddd xmm7,xmm3 ; xmm7=tmp11H
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm5,xmm6
- movdqa xmm7,xmm4
- paddw xmm5,xmm3 ; xmm5=z3
- paddw xmm7,xmm1 ; xmm7=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm5
- punpcklwd xmm2,xmm7
- punpckhwd xmm0,xmm7
- movdqa xmm5,xmm2
- movdqa xmm7,xmm0
- pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L
- pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H
- pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L
- pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm3
- punpcklwd xmm2,xmm4
- punpckhwd xmm0,xmm4
- movdqa xmm3,xmm2
- movdqa xmm4,xmm0
- pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L
- pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H
- pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L
- pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H
-
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
- paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
- paddd xmm3,xmm5 ; xmm3=tmp3L
- paddd xmm4,xmm7 ; xmm4=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
-
- movdqa xmm2,xmm1
- movdqa xmm0,xmm1
- punpcklwd xmm2,xmm6
- punpckhwd xmm0,xmm6
- movdqa xmm1,xmm2
- movdqa xmm6,xmm0
- pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L
- pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H
- pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L
- pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H
-
- paddd xmm2,xmm5 ; xmm2=tmp1L
- paddd xmm0,xmm7 ; xmm0=tmp1H
- paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm7
- paddd xmm5,xmm3 ; xmm5=data0L
- paddd xmm7,xmm4 ; xmm7=data0H
- psubd xmm2,xmm3 ; xmm2=data7L
- psubd xmm0,xmm4 ; xmm0=data7H
-
- movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
-
- paddd xmm5,xmm3
- paddd xmm7,xmm3
- psrad xmm5,DESCALE_P1
- psrad xmm7,DESCALE_P1
- paddd xmm2,xmm3
- paddd xmm0,xmm3
- psrad xmm2,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
-
- movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
- movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
-
- movdqa xmm7,xmm4
- movdqa xmm0,xmm3
- paddd xmm4,xmm1 ; xmm4=data1L
- paddd xmm3,xmm6 ; xmm3=data1H
- psubd xmm7,xmm1 ; xmm7=data6L
- psubd xmm0,xmm6 ; xmm0=data6H
-
- movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
-
- paddd xmm4,xmm1
- paddd xmm3,xmm1
- psrad xmm4,DESCALE_P1
- psrad xmm3,DESCALE_P1
- paddd xmm7,xmm1
- paddd xmm0,xmm1
- psrad xmm7,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
-
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
- punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
-
- movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
- movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
- movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
- movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
-
- movdqa xmm5,xmm3
- movdqa xmm6,xmm0
- paddd xmm3,xmm4 ; xmm3=data2L
- paddd xmm0,xmm2 ; xmm0=data2H
- psubd xmm5,xmm4 ; xmm5=data5L
- psubd xmm6,xmm2 ; xmm6=data5H
-
- movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
-
- paddd xmm3,xmm7
- paddd xmm0,xmm7
- psrad xmm3,DESCALE_P1
- psrad xmm0,DESCALE_P1
- paddd xmm5,xmm7
- paddd xmm6,xmm7
- psrad xmm5,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
- packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
- movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
- movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
- movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
-
- movdqa xmm0,xmm1
- movdqa xmm6,xmm4
- paddd xmm1,xmm2 ; xmm1=data3L
- paddd xmm4,xmm7 ; xmm4=data3H
- psubd xmm0,xmm2 ; xmm0=data4L
- psubd xmm6,xmm7 ; xmm6=data4H
-
- movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
-
- paddd xmm1,xmm2
- paddd xmm4,xmm2
- psrad xmm1,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm0,xmm2
- paddd xmm6,xmm2
- psrad xmm0,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
- packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
- movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
-
- movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
- punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
- punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
- punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
- punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
- movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
- punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
- punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
-
- movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
-
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm4, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm4, xmm3 ; xmm3=in6=z3
+ punpckhwd xmm5, xmm3
+ movdqa xmm1, xmm4
+ movdqa xmm3, xmm5
+ pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L
+ pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
+ pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H
+
+ movdqa xmm6, xmm0
+ paddw xmm0, xmm2 ; xmm0=in0+in4
+ psubw xmm6, xmm2 ; xmm6=in0-in4
+
+ pxor xmm7, xmm7
+ pxor xmm2, xmm2
+ punpcklwd xmm7, xmm0 ; xmm7=tmp0L
+ punpckhwd xmm2, xmm0 ; xmm2=tmp0H
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+ psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm4 ; xmm7=tmp10L
+ psubd xmm0, xmm4 ; xmm0=tmp13L
+ movdqa xmm4, xmm2
+ paddd xmm2, xmm5 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm7, xmm7
+ punpcklwd xmm5, xmm6 ; xmm5=tmp1L
+ punpckhwd xmm7, xmm6 ; xmm7=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+ movdqa xmm2, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm2, xmm1 ; xmm2=tmp12L
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm3 ; xmm7=tmp11H
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm5, xmm6
+ movdqa xmm7, xmm4
+ paddw xmm5, xmm3 ; xmm5=z3
+ paddw xmm7, xmm1 ; xmm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm5
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm0, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm7, xmm0
+ pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L
+ pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H
+ pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
+ pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm3
+ punpcklwd xmm2, xmm4
+ punpckhwd xmm0, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm0
+ pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L
+ pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H
+ pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L
+ pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H
+
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
+ paddd xmm3, xmm5 ; xmm3=tmp3L
+ paddd xmm4, xmm7 ; xmm4=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
+
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm1
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm0, xmm6
+ movdqa xmm1, xmm2
+ movdqa xmm6, xmm0
+ pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L
+ pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H
+ pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L
+ pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
+
+ paddd xmm2, xmm5 ; xmm2=tmp1L
+ paddd xmm0, xmm7 ; xmm0=tmp1H
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm7
+ paddd xmm5, xmm3 ; xmm5=data0L
+ paddd xmm7, xmm4 ; xmm7=data0H
+ psubd xmm2, xmm3 ; xmm2=data7L
+ psubd xmm0, xmm4 ; xmm0=data7H
+
+ movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
+
+ paddd xmm5, xmm3
+ paddd xmm7, xmm3
+ psrad xmm5, DESCALE_P1
+ psrad xmm7, DESCALE_P1
+ paddd xmm2, xmm3
+ paddd xmm0, xmm3
+ psrad xmm2, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
+
+ movdqa xmm7, xmm4
+ movdqa xmm0, xmm3
+ paddd xmm4, xmm1 ; xmm4=data1L
+ paddd xmm3, xmm6 ; xmm3=data1H
+ psubd xmm7, xmm1 ; xmm7=data6L
+ psubd xmm0, xmm6 ; xmm0=data6H
+
+ movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
+
+ paddd xmm4, xmm1
+ paddd xmm3, xmm1
+ psrad xmm4, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+ paddd xmm7, xmm1
+ paddd xmm0, xmm1
+ psrad xmm7, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm0
+ paddd xmm3, xmm4 ; xmm3=data2L
+ paddd xmm0, xmm2 ; xmm0=data2H
+ psubd xmm5, xmm4 ; xmm5=data5L
+ psubd xmm6, xmm2 ; xmm6=data5H
+
+ movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
+
+ paddd xmm3, xmm7
+ paddd xmm0, xmm7
+ psrad xmm3, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+ paddd xmm5, xmm7
+ paddd xmm6, xmm7
+ psrad xmm5, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
+ packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
+
+ movdqa xmm0, xmm1
+ movdqa xmm6, xmm4
+ paddd xmm1, xmm2 ; xmm1=data3L
+ paddd xmm4, xmm7 ; xmm4=data3H
+ psubd xmm0, xmm2 ; xmm0=data4L
+ psubd xmm6, xmm7 ; xmm6=data4H
+
+ movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
+
+ paddd xmm1, xmm2
+ paddd xmm4, xmm2
+ psrad xmm1, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm0, xmm2
+ paddd xmm6, xmm2
+ psrad xmm0, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
+ packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
+
+ movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
+ punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
+
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
.column_end:
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov rax, [original_rbp]
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
-
- ; -- Even part
-
- ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm6,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm6,xmm2 ; xmm2=in6=z3
- punpckhwd xmm5,xmm2
- movdqa xmm1,xmm6
- movdqa xmm2,xmm5
- pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L
- pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H
- pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L
- pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H
-
- movdqa xmm3,xmm7
- paddw xmm7,xmm0 ; xmm7=in0+in4
- psubw xmm3,xmm0 ; xmm3=in0-in4
-
- pxor xmm4,xmm4
- pxor xmm0,xmm0
- punpcklwd xmm4,xmm7 ; xmm4=tmp0L
- punpckhwd xmm0,xmm7 ; xmm0=tmp0H
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
- psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
- movdqa xmm7,xmm4
- paddd xmm4,xmm6 ; xmm4=tmp10L
- psubd xmm7,xmm6 ; xmm7=tmp13L
- movdqa xmm6,xmm0
- paddd xmm0,xmm5 ; xmm0=tmp10H
- psubd xmm6,xmm5 ; xmm6=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm4,xmm4
- punpcklwd xmm5,xmm3 ; xmm5=tmp1L
- punpckhwd xmm4,xmm3 ; xmm4=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
- movdqa xmm0,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm0,xmm1 ; xmm0=tmp12L
- movdqa xmm7,xmm4
- paddd xmm4,xmm2 ; xmm4=tmp11H
- psubd xmm7,xmm2 ; xmm7=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
- movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
- movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
- movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
-
- movdqa xmm5,xmm6
- movdqa xmm4,xmm3
- paddw xmm5,xmm1 ; xmm5=z3
- paddw xmm4,xmm2 ; xmm4=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm0,xmm5
- movdqa xmm7,xmm5
- punpcklwd xmm0,xmm4
- punpckhwd xmm7,xmm4
- movdqa xmm5,xmm0
- movdqa xmm4,xmm7
- pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L
- pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H
- pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L
- pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movdqa xmm0,xmm1
- movdqa xmm7,xmm1
- punpcklwd xmm0,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm1,xmm0
- movdqa xmm3,xmm7
- pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L
- pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H
- pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L
- pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H
-
- paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
- paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
- paddd xmm1,xmm5 ; xmm1=tmp3L
- paddd xmm3,xmm4 ; xmm3=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
-
- movdqa xmm0,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm0,xmm6
- punpckhwd xmm7,xmm6
- movdqa xmm2,xmm0
- movdqa xmm6,xmm7
- pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L
- pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H
- pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L
- pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H
-
- paddd xmm0,xmm5 ; xmm0=tmp1L
- paddd xmm7,xmm4 ; xmm7=tmp1H
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
-
- movdqa xmm0,xmm5
- movdqa xmm7,xmm4
- paddd xmm5,xmm1 ; xmm5=data0L
- paddd xmm4,xmm3 ; xmm4=data0H
- psubd xmm0,xmm1 ; xmm0=data7L
- psubd xmm7,xmm3 ; xmm7=data7H
-
- movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
-
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrad xmm5,DESCALE_P2
- psrad xmm4,DESCALE_P2
- paddd xmm0,xmm1
- paddd xmm7,xmm1
- psrad xmm0,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
- packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
- movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
-
- movdqa xmm4,xmm3
- movdqa xmm7,xmm1
- paddd xmm3,xmm2 ; xmm3=data1L
- paddd xmm1,xmm6 ; xmm1=data1H
- psubd xmm4,xmm2 ; xmm4=data6L
- psubd xmm7,xmm6 ; xmm7=data6H
-
- movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
-
- paddd xmm3,xmm2
- paddd xmm1,xmm2
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm4,xmm2
- paddd xmm7,xmm2
- psrad xmm4,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
-
- packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
- movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
- movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm4,xmm6
- movdqa xmm0,xmm2
- paddd xmm6,xmm1 ; xmm6=data2L
- paddd xmm2,xmm7 ; xmm2=data2H
- psubd xmm4,xmm1 ; xmm4=data5L
- psubd xmm0,xmm7 ; xmm0=data5H
-
- movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
-
- paddd xmm6,xmm5
- paddd xmm2,xmm5
- psrad xmm6,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm4,xmm5
- paddd xmm0,xmm5
- psrad xmm4,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
- packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
-
- movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
- movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
- movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
- movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm1
- paddd xmm3,xmm7 ; xmm3=data3L
- paddd xmm1,xmm5 ; xmm1=data3H
- psubd xmm2,xmm7 ; xmm2=data4L
- psubd xmm0,xmm5 ; xmm0=data4H
-
- movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
-
- paddd xmm3,xmm7
- paddd xmm1,xmm7
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm2,xmm7
- paddd xmm0,xmm7
- psrad xmm2,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
-
- packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
- packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm7,xmm5
- paddb xmm1,xmm5
- paddb xmm6,xmm5
- paddb xmm3,xmm5
-
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
- punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
- mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
- mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; -- Even part
+
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm6, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm6, xmm2 ; xmm2=in6=z3
+ punpckhwd xmm5, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm2, xmm5
+ pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L
+ pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
+ pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H
+
+ movdqa xmm3, xmm7
+ paddw xmm7, xmm0 ; xmm7=in0+in4
+ psubw xmm3, xmm0 ; xmm3=in0-in4
+
+ pxor xmm4, xmm4
+ pxor xmm0, xmm0
+ punpcklwd xmm4, xmm7 ; xmm4=tmp0L
+ punpckhwd xmm0, xmm7 ; xmm0=tmp0H
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+ psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm6 ; xmm4=tmp10L
+ psubd xmm7, xmm6 ; xmm7=tmp13L
+ movdqa xmm6, xmm0
+ paddd xmm0, xmm5 ; xmm0=tmp10H
+ psubd xmm6, xmm5 ; xmm6=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm4, xmm4
+ punpcklwd xmm5, xmm3 ; xmm5=tmp1L
+ punpckhwd xmm4, xmm3 ; xmm4=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+ movdqa xmm0, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm0, xmm1 ; xmm0=tmp12L
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm2 ; xmm4=tmp11H
+ psubd xmm7, xmm2 ; xmm7=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
+
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm3
+ paddw xmm5, xmm1 ; xmm5=z3
+ paddw xmm4, xmm2 ; xmm4=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm5
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm7, xmm4
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm7
+ pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L
+ pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H
+ pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
+ pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm0, xmm1
+ movdqa xmm7, xmm1
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm7
+ pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L
+ pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H
+ pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp3L
+ pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H
+
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
+ paddd xmm1, xmm5 ; xmm1=tmp3L
+ paddd xmm3, xmm4 ; xmm3=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
+
+ movdqa xmm0, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm7, xmm6
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm7
+ pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L
+ pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H
+ pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L
+ pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
+
+ paddd xmm0, xmm5 ; xmm0=tmp1L
+ paddd xmm7, xmm4 ; xmm7=tmp1H
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm4
+ paddd xmm5, xmm1 ; xmm5=data0L
+ paddd xmm4, xmm3 ; xmm4=data0H
+ psubd xmm0, xmm1 ; xmm0=data7L
+ psubd xmm7, xmm3 ; xmm7=data7H
+
+ movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
+
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrad xmm5, DESCALE_P2
+ psrad xmm4, DESCALE_P2
+ paddd xmm0, xmm1
+ paddd xmm7, xmm1
+ psrad xmm0, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
+ packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
+
+ movdqa xmm4, xmm3
+ movdqa xmm7, xmm1
+ paddd xmm3, xmm2 ; xmm3=data1L
+ paddd xmm1, xmm6 ; xmm1=data1H
+ psubd xmm4, xmm2 ; xmm4=data6L
+ psubd xmm7, xmm6 ; xmm7=data6H
+
+ movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
+
+ paddd xmm3, xmm2
+ paddd xmm1, xmm2
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm4, xmm2
+ paddd xmm7, xmm2
+ psrad xmm4, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+ packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm4, xmm6
+ movdqa xmm0, xmm2
+ paddd xmm6, xmm1 ; xmm6=data2L
+ paddd xmm2, xmm7 ; xmm2=data2H
+ psubd xmm4, xmm1 ; xmm4=data5L
+ psubd xmm0, xmm7 ; xmm0=data5H
+
+ movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
+
+ paddd xmm6, xmm5
+ paddd xmm2, xmm5
+ psrad xmm6, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm4, xmm5
+ paddd xmm0, xmm5
+ psrad xmm4, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
+ packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm1
+ paddd xmm3, xmm7 ; xmm3=data3L
+ paddd xmm1, xmm5 ; xmm1=data3H
+ psubd xmm2, xmm7 ; xmm2=data4L
+ psubd xmm0, xmm5 ; xmm0=data4H
+
+ movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
+
+ paddd xmm3, xmm7
+ paddd xmm1, xmm7
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm2, xmm7
+ paddd xmm0, xmm7
+ psrad xmm2, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
+
+ packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
+ packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm7, xmm5
+ paddb xmm1, xmm5
+ paddb xmm6, xmm5
+ paddb xmm3, xmm5
+
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
+ punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+ mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+ mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+ mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define CONST_BITS 13
-%define PASS1_BITS 2
+%define CONST_BITS 13
+%define PASS1_BITS 2
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
+%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
+F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026)
%endif
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_idct_islow_sse2)
+ alignz 16
+ global EXTN(jconst_idct_islow_sse2)
EXTN(jconst_idct_islow_sse2):
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
- alignz 16
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; JSAMPARRAY output_buf, JDIMENSION output_col)
;
-%define dct_table(b) (b)+8 ; jpeg_component_info *compptr
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
+%define dct_table(b) (b)+8 ; jpeg_component_info *compptr
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 12
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 12
- align 16
- global EXTN(jsimd_idct_islow_sse2)
+ align 16
+ global EXTN(jsimd_idct_islow_sse2)
EXTN(jsimd_idct_islow_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm5,PASS1_BITS
-
- movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
- punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
-
- pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
- jmp near .column_end
- alignx 16,7
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm5, PASS1_BITS
+
+ movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ jmp near .column_end
+ alignx 16, 7
%endif
.columnDCT:
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm4,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm4,xmm3 ; xmm3=in6=z3
- punpckhwd xmm5,xmm3
- movdqa xmm1,xmm4
- movdqa xmm3,xmm5
- pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
- pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
- pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
- pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
-
- movdqa xmm6,xmm0
- paddw xmm0,xmm2 ; xmm0=in0+in4
- psubw xmm6,xmm2 ; xmm6=in0-in4
-
- pxor xmm7,xmm7
- pxor xmm2,xmm2
- punpcklwd xmm7,xmm0 ; xmm7=tmp0L
- punpckhwd xmm2,xmm0 ; xmm2=tmp0H
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
- psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
- movdqa xmm0,xmm7
- paddd xmm7,xmm4 ; xmm7=tmp10L
- psubd xmm0,xmm4 ; xmm0=tmp13L
- movdqa xmm4,xmm2
- paddd xmm2,xmm5 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm7,xmm7
- punpcklwd xmm5,xmm6 ; xmm5=tmp1L
- punpckhwd xmm7,xmm6 ; xmm7=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
- movdqa xmm2,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm2,xmm1 ; xmm2=tmp12L
- movdqa xmm0,xmm7
- paddd xmm7,xmm3 ; xmm7=tmp11H
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm5,xmm6
- movdqa xmm7,xmm4
- paddw xmm5,xmm3 ; xmm5=z3
- paddw xmm7,xmm1 ; xmm7=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm5
- punpcklwd xmm2,xmm7
- punpckhwd xmm0,xmm7
- movdqa xmm5,xmm2
- movdqa xmm7,xmm0
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
- pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
- pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm3
- punpcklwd xmm2,xmm4
- punpckhwd xmm0,xmm4
- movdqa xmm3,xmm2
- movdqa xmm4,xmm0
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
-
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
- paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
- paddd xmm3,xmm5 ; xmm3=tmp3L
- paddd xmm4,xmm7 ; xmm4=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
-
- movdqa xmm2,xmm1
- movdqa xmm0,xmm1
- punpcklwd xmm2,xmm6
- punpckhwd xmm0,xmm6
- movdqa xmm1,xmm2
- movdqa xmm6,xmm0
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
-
- paddd xmm2,xmm5 ; xmm2=tmp1L
- paddd xmm0,xmm7 ; xmm0=tmp1H
- paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm7
- paddd xmm5,xmm3 ; xmm5=data0L
- paddd xmm7,xmm4 ; xmm7=data0H
- psubd xmm2,xmm3 ; xmm2=data7L
- psubd xmm0,xmm4 ; xmm0=data7H
-
- movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
-
- paddd xmm5,xmm3
- paddd xmm7,xmm3
- psrad xmm5,DESCALE_P1
- psrad xmm7,DESCALE_P1
- paddd xmm2,xmm3
- paddd xmm0,xmm3
- psrad xmm2,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
-
- movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
- movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
-
- movdqa xmm7,xmm4
- movdqa xmm0,xmm3
- paddd xmm4,xmm1 ; xmm4=data1L
- paddd xmm3,xmm6 ; xmm3=data1H
- psubd xmm7,xmm1 ; xmm7=data6L
- psubd xmm0,xmm6 ; xmm0=data6H
-
- movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
-
- paddd xmm4,xmm1
- paddd xmm3,xmm1
- psrad xmm4,DESCALE_P1
- psrad xmm3,DESCALE_P1
- paddd xmm7,xmm1
- paddd xmm0,xmm1
- psrad xmm7,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
-
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
- punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
-
- movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
- movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
- movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
- movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
-
- movdqa xmm5,xmm3
- movdqa xmm6,xmm0
- paddd xmm3,xmm4 ; xmm3=data2L
- paddd xmm0,xmm2 ; xmm0=data2H
- psubd xmm5,xmm4 ; xmm5=data5L
- psubd xmm6,xmm2 ; xmm6=data5H
-
- movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
-
- paddd xmm3,xmm7
- paddd xmm0,xmm7
- psrad xmm3,DESCALE_P1
- psrad xmm0,DESCALE_P1
- paddd xmm5,xmm7
- paddd xmm6,xmm7
- psrad xmm5,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
- packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
- movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
- movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
- movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
-
- movdqa xmm0,xmm1
- movdqa xmm6,xmm4
- paddd xmm1,xmm2 ; xmm1=data3L
- paddd xmm4,xmm7 ; xmm4=data3H
- psubd xmm0,xmm2 ; xmm0=data4L
- psubd xmm6,xmm7 ; xmm6=data4H
-
- movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
-
- paddd xmm1,xmm2
- paddd xmm4,xmm2
- psrad xmm1,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm0,xmm2
- paddd xmm6,xmm2
- psrad xmm0,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
- packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
- movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
-
- movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
- punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
- punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
- punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
- punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
- movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
- punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
- punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
-
- movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
-
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm4, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm4, xmm3 ; xmm3=in6=z3
+ punpckhwd xmm5, xmm3
+ movdqa xmm1, xmm4
+ movdqa xmm3, xmm5
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
+
+ movdqa xmm6, xmm0
+ paddw xmm0, xmm2 ; xmm0=in0+in4
+ psubw xmm6, xmm2 ; xmm6=in0-in4
+
+ pxor xmm7, xmm7
+ pxor xmm2, xmm2
+ punpcklwd xmm7, xmm0 ; xmm7=tmp0L
+ punpckhwd xmm2, xmm0 ; xmm2=tmp0H
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+ psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm4 ; xmm7=tmp10L
+ psubd xmm0, xmm4 ; xmm0=tmp13L
+ movdqa xmm4, xmm2
+ paddd xmm2, xmm5 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm7, xmm7
+ punpcklwd xmm5, xmm6 ; xmm5=tmp1L
+ punpckhwd xmm7, xmm6 ; xmm7=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+ movdqa xmm2, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm2, xmm1 ; xmm2=tmp12L
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm3 ; xmm7=tmp11H
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm5, xmm6
+ movdqa xmm7, xmm4
+ paddw xmm5, xmm3 ; xmm5=z3
+ paddw xmm7, xmm1 ; xmm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm5
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm0, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm7, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm3
+ punpcklwd xmm2, xmm4
+ punpckhwd xmm0, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
+
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
+ paddd xmm3, xmm5 ; xmm3=tmp3L
+ paddd xmm4, xmm7 ; xmm4=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
+
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm1
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm0, xmm6
+ movdqa xmm1, xmm2
+ movdqa xmm6, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
+
+ paddd xmm2, xmm5 ; xmm2=tmp1L
+ paddd xmm0, xmm7 ; xmm0=tmp1H
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm7
+ paddd xmm5, xmm3 ; xmm5=data0L
+ paddd xmm7, xmm4 ; xmm7=data0H
+ psubd xmm2, xmm3 ; xmm2=data7L
+ psubd xmm0, xmm4 ; xmm0=data7H
+
+ movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
+
+ paddd xmm5, xmm3
+ paddd xmm7, xmm3
+ psrad xmm5, DESCALE_P1
+ psrad xmm7, DESCALE_P1
+ paddd xmm2, xmm3
+ paddd xmm0, xmm3
+ psrad xmm2, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
+
+ movdqa xmm7, xmm4
+ movdqa xmm0, xmm3
+ paddd xmm4, xmm1 ; xmm4=data1L
+ paddd xmm3, xmm6 ; xmm3=data1H
+ psubd xmm7, xmm1 ; xmm7=data6L
+ psubd xmm0, xmm6 ; xmm0=data6H
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
+
+ paddd xmm4, xmm1
+ paddd xmm3, xmm1
+ psrad xmm4, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+ paddd xmm7, xmm1
+ paddd xmm0, xmm1
+ psrad xmm7, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm0
+ paddd xmm3, xmm4 ; xmm3=data2L
+ paddd xmm0, xmm2 ; xmm0=data2H
+ psubd xmm5, xmm4 ; xmm5=data5L
+ psubd xmm6, xmm2 ; xmm6=data5H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
+
+ paddd xmm3, xmm7
+ paddd xmm0, xmm7
+ psrad xmm3, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+ paddd xmm5, xmm7
+ paddd xmm6, xmm7
+ psrad xmm5, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
+ packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
+
+ movdqa xmm0, xmm1
+ movdqa xmm6, xmm4
+ paddd xmm1, xmm2 ; xmm1=data3L
+ paddd xmm4, xmm7 ; xmm4=data3H
+ psubd xmm0, xmm2 ; xmm0=data4L
+ psubd xmm6, xmm7 ; xmm6=data4H
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
+
+ paddd xmm1, xmm2
+ paddd xmm4, xmm2
+ psrad xmm1, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm0, xmm2
+ paddd xmm6, xmm2
+ psrad xmm0, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
+ packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
+
+ movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
+ punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
+
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
.column_end:
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
-
- ; -- Even part
-
- ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm6,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm6,xmm2 ; xmm2=in6=z3
- punpckhwd xmm5,xmm2
- movdqa xmm1,xmm6
- movdqa xmm2,xmm5
- pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
- pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
- pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
- pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
-
- movdqa xmm3,xmm7
- paddw xmm7,xmm0 ; xmm7=in0+in4
- psubw xmm3,xmm0 ; xmm3=in0-in4
-
- pxor xmm4,xmm4
- pxor xmm0,xmm0
- punpcklwd xmm4,xmm7 ; xmm4=tmp0L
- punpckhwd xmm0,xmm7 ; xmm0=tmp0H
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
- psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
- movdqa xmm7,xmm4
- paddd xmm4,xmm6 ; xmm4=tmp10L
- psubd xmm7,xmm6 ; xmm7=tmp13L
- movdqa xmm6,xmm0
- paddd xmm0,xmm5 ; xmm0=tmp10H
- psubd xmm6,xmm5 ; xmm6=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm4,xmm4
- punpcklwd xmm5,xmm3 ; xmm5=tmp1L
- punpckhwd xmm4,xmm3 ; xmm4=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
- movdqa xmm0,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm0,xmm1 ; xmm0=tmp12L
- movdqa xmm7,xmm4
- paddd xmm4,xmm2 ; xmm4=tmp11H
- psubd xmm7,xmm2 ; xmm7=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
- movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
- movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
- movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
-
- movdqa xmm5,xmm6
- movdqa xmm4,xmm3
- paddw xmm5,xmm1 ; xmm5=z3
- paddw xmm4,xmm2 ; xmm4=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm0,xmm5
- movdqa xmm7,xmm5
- punpcklwd xmm0,xmm4
- punpckhwd xmm7,xmm4
- movdqa xmm5,xmm0
- movdqa xmm4,xmm7
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
- pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
- pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movdqa xmm0,xmm1
- movdqa xmm7,xmm1
- punpcklwd xmm0,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm1,xmm0
- movdqa xmm3,xmm7
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
-
- paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
- paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
- paddd xmm1,xmm5 ; xmm1=tmp3L
- paddd xmm3,xmm4 ; xmm3=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
-
- movdqa xmm0,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm0,xmm6
- punpckhwd xmm7,xmm6
- movdqa xmm2,xmm0
- movdqa xmm6,xmm7
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
-
- paddd xmm0,xmm5 ; xmm0=tmp1L
- paddd xmm7,xmm4 ; xmm7=tmp1H
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
-
- movdqa xmm0,xmm5
- movdqa xmm7,xmm4
- paddd xmm5,xmm1 ; xmm5=data0L
- paddd xmm4,xmm3 ; xmm4=data0H
- psubd xmm0,xmm1 ; xmm0=data7L
- psubd xmm7,xmm3 ; xmm7=data7H
-
- movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
-
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrad xmm5,DESCALE_P2
- psrad xmm4,DESCALE_P2
- paddd xmm0,xmm1
- paddd xmm7,xmm1
- psrad xmm0,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
- packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
- movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
-
- movdqa xmm4,xmm3
- movdqa xmm7,xmm1
- paddd xmm3,xmm2 ; xmm3=data1L
- paddd xmm1,xmm6 ; xmm1=data1H
- psubd xmm4,xmm2 ; xmm4=data6L
- psubd xmm7,xmm6 ; xmm7=data6H
-
- movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
-
- paddd xmm3,xmm2
- paddd xmm1,xmm2
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm4,xmm2
- paddd xmm7,xmm2
- psrad xmm4,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
-
- packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
- movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
- movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm4,xmm6
- movdqa xmm0,xmm2
- paddd xmm6,xmm1 ; xmm6=data2L
- paddd xmm2,xmm7 ; xmm2=data2H
- psubd xmm4,xmm1 ; xmm4=data5L
- psubd xmm0,xmm7 ; xmm0=data5H
-
- movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
-
- paddd xmm6,xmm5
- paddd xmm2,xmm5
- psrad xmm6,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm4,xmm5
- paddd xmm0,xmm5
- psrad xmm4,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
- packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
-
- movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
- movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
- movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
- movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm1
- paddd xmm3,xmm7 ; xmm3=data3L
- paddd xmm1,xmm5 ; xmm1=data3H
- psubd xmm2,xmm7 ; xmm2=data4L
- psubd xmm0,xmm5 ; xmm0=data4H
-
- movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
-
- paddd xmm3,xmm7
- paddd xmm1,xmm7
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm2,xmm7
- paddd xmm0,xmm7
- psrad xmm2,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
-
- packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
- packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm7,xmm5
- paddb xmm1,xmm5
- paddb xmm6,xmm5
- paddb xmm3,xmm5
-
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
- punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
- mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
- mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm6, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm6, xmm2 ; xmm2=in6=z3
+ punpckhwd xmm5, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm2, xmm5
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
+
+ movdqa xmm3, xmm7
+ paddw xmm7, xmm0 ; xmm7=in0+in4
+ psubw xmm3, xmm0 ; xmm3=in0-in4
+
+ pxor xmm4, xmm4
+ pxor xmm0, xmm0
+ punpcklwd xmm4, xmm7 ; xmm4=tmp0L
+ punpckhwd xmm0, xmm7 ; xmm0=tmp0H
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+ psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm6 ; xmm4=tmp10L
+ psubd xmm7, xmm6 ; xmm7=tmp13L
+ movdqa xmm6, xmm0
+ paddd xmm0, xmm5 ; xmm0=tmp10H
+ psubd xmm6, xmm5 ; xmm6=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm4, xmm4
+ punpcklwd xmm5, xmm3 ; xmm5=tmp1L
+ punpckhwd xmm4, xmm3 ; xmm4=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+ movdqa xmm0, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm0, xmm1 ; xmm0=tmp12L
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm2 ; xmm4=tmp11H
+ psubd xmm7, xmm2 ; xmm7=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
+
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm3
+ paddw xmm5, xmm1 ; xmm5=z3
+ paddw xmm4, xmm2 ; xmm4=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm5
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm7, xmm4
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm0, xmm1
+ movdqa xmm7, xmm1
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
+
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
+ paddd xmm1, xmm5 ; xmm1=tmp3L
+ paddd xmm3, xmm4 ; xmm3=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
+
+ movdqa xmm0, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm7, xmm6
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
+
+ paddd xmm0, xmm5 ; xmm0=tmp1L
+ paddd xmm7, xmm4 ; xmm7=tmp1H
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm4
+ paddd xmm5, xmm1 ; xmm5=data0L
+ paddd xmm4, xmm3 ; xmm4=data0H
+ psubd xmm0, xmm1 ; xmm0=data7L
+ psubd xmm7, xmm3 ; xmm7=data7H
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
+
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrad xmm5, DESCALE_P2
+ psrad xmm4, DESCALE_P2
+ paddd xmm0, xmm1
+ paddd xmm7, xmm1
+ psrad xmm0, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
+ packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
+
+ movdqa xmm4, xmm3
+ movdqa xmm7, xmm1
+ paddd xmm3, xmm2 ; xmm3=data1L
+ paddd xmm1, xmm6 ; xmm1=data1H
+ psubd xmm4, xmm2 ; xmm4=data6L
+ psubd xmm7, xmm6 ; xmm7=data6H
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
+
+ paddd xmm3, xmm2
+ paddd xmm1, xmm2
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm4, xmm2
+ paddd xmm7, xmm2
+ psrad xmm4, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+ packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm4, xmm6
+ movdqa xmm0, xmm2
+ paddd xmm6, xmm1 ; xmm6=data2L
+ paddd xmm2, xmm7 ; xmm2=data2H
+ psubd xmm4, xmm1 ; xmm4=data5L
+ psubd xmm0, xmm7 ; xmm0=data5H
+
+ movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
+
+ paddd xmm6, xmm5
+ paddd xmm2, xmm5
+ psrad xmm6, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm4, xmm5
+ paddd xmm0, xmm5
+ psrad xmm4, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
+ packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm1
+ paddd xmm3, xmm7 ; xmm3=data3L
+ paddd xmm1, xmm5 ; xmm1=data3H
+ psubd xmm2, xmm7 ; xmm2=data4L
+ psubd xmm0, xmm5 ; xmm0=data4H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
+
+ paddd xmm3, xmm7
+ paddd xmm1, xmm7
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm2, xmm7
+ paddd xmm0, xmm7
+ psrad xmm2, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
+
+ packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
+ packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm7, xmm5
+ paddb xmm1, xmm5
+ paddb xmm6, xmm5
+ paddb xmm3, xmm5
+
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
+ punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+ mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define CONST_BITS 13
-%define PASS1_BITS 2
+%define CONST_BITS 13
+%define PASS1_BITS 2
-%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
+%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
%if CONST_BITS == 13
-F_0_211 equ 1730 ; FIX(0.211164243)
-F_0_509 equ 4176 ; FIX(0.509795579)
-F_0_601 equ 4926 ; FIX(0.601344887)
-F_0_720 equ 5906 ; FIX(0.720959822)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_850 equ 6967 ; FIX(0.850430095)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_061 equ 8697 ; FIX(1.061594337)
-F_1_272 equ 10426 ; FIX(1.272758580)
-F_1_451 equ 11893 ; FIX(1.451774981)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_2_172 equ 17799 ; FIX(2.172734803)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_624 equ 29692 ; FIX(3.624509785)
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
-F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
-F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
-F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
-F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
-F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
+F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785)
%endif
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_idct_red_sse2)
+ alignz 16
+ global EXTN(jconst_idct_red_sse2)
EXTN(jconst_idct_red_sse2):
-PW_F184_MF076 times 4 dw F_1_847,-F_0_765
-PW_F256_F089 times 4 dw F_2_562, F_0_899
-PW_F106_MF217 times 4 dw F_1_061,-F_2_172
-PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021 times 4 dw F_1_451,-F_0_211
-PW_F362_MF127 times 4 dw F_3_624,-F_1_272
-PW_F085_MF072 times 4 dw F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
+PW_F184_MF076 times 4 dw F_1_847,-F_0_765
+PW_F256_F089 times 4 dw F_2_562, F_0_899
+PW_F106_MF217 times 4 dw F_1_061,-F_2_172
+PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
+PW_F145_MF021 times 4 dw F_1_451,-F_0_211
+PW_F362_MF127 times 4 dw F_3_624,-F_1_272
+PW_F085_MF072 times 4 dw F_0_850,-F_0_720
+PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
+PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
;
; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 4x4 output block.
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
+%define original_rbp rbp+0
+%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
- align 16
- global EXTN(jsimd_idct_4x4_sse2)
+ align 16
+ global EXTN(jsimd_idct_4x4_sse2)
EXTN(jsimd_idct_4x4_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args
- ; ---- Pass 1: process columns from input.
+ ; ---- Pass 1: process columns from input.
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm0,xmm1
- packsswb xmm0,xmm0
- packsswb xmm0,xmm0
- movd eax,xmm0
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm0,PASS1_BITS
-
- movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
-
- pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
- pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
- pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
- pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
- jmp near .column_end
+ mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, xmm1
+ packsswb xmm0, xmm0
+ packsswb xmm0, xmm0
+ movd eax, xmm0
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm0, PASS1_BITS
+
+ movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+ pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+ pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+ pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+ jmp near .column_end
%endif
.columnDCT:
- ; -- Odd part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm0
- punpcklwd xmm4,xmm1
- punpckhwd xmm5,xmm1
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L)
- pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H)
- pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L)
- pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H)
-
- movdqa xmm6,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm6,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L)
- pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H)
- pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L)
- pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H)
-
- paddd xmm6,xmm4 ; xmm6=tmp2L
- paddd xmm7,xmm5 ; xmm7=tmp2H
- paddd xmm2,xmm0 ; xmm2=tmp0L
- paddd xmm3,xmm1 ; xmm3=tmp0H
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
-
- ; -- Even part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- pxor xmm1,xmm1
- pxor xmm2,xmm2
- punpcklwd xmm1,xmm4 ; xmm1=tmp0L
- punpckhwd xmm2,xmm4 ; xmm2=tmp0H
- psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
- psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
- movdqa xmm3,xmm5 ; xmm5=in2=z2
- punpcklwd xmm5,xmm0 ; xmm0=in6=z3
- punpckhwd xmm3,xmm0
- pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L
- pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H
-
- movdqa xmm4,xmm1
- movdqa xmm0,xmm2
- paddd xmm1,xmm5 ; xmm1=tmp10L
- paddd xmm2,xmm3 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp12L
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- ; -- Final output stage
-
- movdqa xmm5,xmm1
- movdqa xmm3,xmm2
- paddd xmm1,xmm6 ; xmm1=data0L
- paddd xmm2,xmm7 ; xmm2=data0H
- psubd xmm5,xmm6 ; xmm5=data3L
- psubd xmm3,xmm7 ; xmm3=data3H
-
- movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
-
- paddd xmm1,xmm6
- paddd xmm2,xmm6
- psrad xmm1,DESCALE_P1_4
- psrad xmm2,DESCALE_P1_4
- paddd xmm5,xmm6
- paddd xmm3,xmm6
- psrad xmm5,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
-
- movdqa xmm2,xmm4
- movdqa xmm3,xmm0
- paddd xmm4,xmm7 ; xmm4=data1L
- paddd xmm0,xmm6 ; xmm0=data1H
- psubd xmm2,xmm7 ; xmm2=data2L
- psubd xmm3,xmm6 ; xmm3=data2H
-
- movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
-
- paddd xmm4,xmm7
- paddd xmm0,xmm7
- psrad xmm4,DESCALE_P1_4
- psrad xmm0,DESCALE_P1_4
- paddd xmm2,xmm7
- paddd xmm3,xmm7
- psrad xmm2,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
- punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
- movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
- punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm0
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L)
+ pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H)
+ pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L)
+ pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H)
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm6, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L)
+ pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H)
+ pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L)
+ pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H)
+
+ paddd xmm6, xmm4 ; xmm6=tmp2L
+ paddd xmm7, xmm5 ; xmm7=tmp2H
+ paddd xmm2, xmm0 ; xmm2=tmp0L
+ paddd xmm3, xmm1 ; xmm3=tmp0H
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpcklwd xmm1, xmm4 ; xmm1=tmp0L
+ punpckhwd xmm2, xmm4 ; xmm2=tmp0H
+ psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+ psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+ movdqa xmm3, xmm5 ; xmm5=in2=z2
+ punpcklwd xmm5, xmm0 ; xmm0=in6=z3
+ punpckhwd xmm3, xmm0
+ pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L
+ pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H
+
+ movdqa xmm4, xmm1
+ movdqa xmm0, xmm2
+ paddd xmm1, xmm5 ; xmm1=tmp10L
+ paddd xmm2, xmm3 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp12L
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ ; -- Final output stage
+
+ movdqa xmm5, xmm1
+ movdqa xmm3, xmm2
+ paddd xmm1, xmm6 ; xmm1=data0L
+ paddd xmm2, xmm7 ; xmm2=data0H
+ psubd xmm5, xmm6 ; xmm5=data3L
+ psubd xmm3, xmm7 ; xmm3=data3H
+
+ movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
+
+ paddd xmm1, xmm6
+ paddd xmm2, xmm6
+ psrad xmm1, DESCALE_P1_4
+ psrad xmm2, DESCALE_P1_4
+ paddd xmm5, xmm6
+ paddd xmm3, xmm6
+ psrad xmm5, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
+
+ movdqa xmm2, xmm4
+ movdqa xmm3, xmm0
+ paddd xmm4, xmm7 ; xmm4=data1L
+ paddd xmm0, xmm6 ; xmm0=data1H
+ psubd xmm2, xmm7 ; xmm2=data2L
+ psubd xmm3, xmm6 ; xmm3=data2H
+
+ movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
+
+ paddd xmm4, xmm7
+ paddd xmm0, xmm7
+ psrad xmm4, DESCALE_P1_4
+ psrad xmm0, DESCALE_P1_4
+ paddd xmm2, xmm7
+ paddd xmm3, xmm7
+ psrad xmm2, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
.column_end:
- ; -- Prefetch the next coefficient block
+ ; -- Prefetch the next coefficient block
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
- ; ---- Pass 2: process rows, store into output array.
+ ; ---- Pass 2: process rows, store into output array.
- mov rax, [original_rbp]
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
- ; -- Even part
+ ; -- Even part
- pxor xmm4,xmm4
- punpcklwd xmm4,xmm1 ; xmm4=tmp0
- psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+ pxor xmm4, xmm4
+ punpcklwd xmm4, xmm1 ; xmm4=tmp0
+ psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
- ; -- Odd part
+ ; -- Odd part
- punpckhwd xmm1,xmm0
- punpckhwd xmm6,xmm3
- movdqa xmm5,xmm1
- movdqa xmm2,xmm6
- pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2)
- pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2)
- pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0)
- pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0)
+ punpckhwd xmm1, xmm0
+ punpckhwd xmm6, xmm3
+ movdqa xmm5, xmm1
+ movdqa xmm2, xmm6
+ pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2)
+ pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2)
+ pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0)
+ pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0)
- paddd xmm6,xmm1 ; xmm6=tmp2
- paddd xmm2,xmm5 ; xmm2=tmp0
+ paddd xmm6, xmm1 ; xmm6=tmp2
+ paddd xmm2, xmm5 ; xmm2=tmp0
- ; -- Even part
+ ; -- Even part
- punpcklwd xmm0,xmm3
- pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2
+ punpcklwd xmm0, xmm3
+ pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2
- movdqa xmm7,xmm4
- paddd xmm4,xmm0 ; xmm4=tmp10
- psubd xmm7,xmm0 ; xmm7=tmp12
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm0 ; xmm4=tmp10
+ psubd xmm7, xmm0 ; xmm7=tmp12
- ; -- Final output stage
+ ; -- Final output stage
- movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
+ movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
- movdqa xmm5,xmm4
- movdqa xmm3,xmm7
- paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
- paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
- psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
- psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm7
+ paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
+ paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
+ psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
+ psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
- paddd xmm4,xmm1
- paddd xmm7,xmm1
- psrad xmm4,DESCALE_P2_4
- psrad xmm7,DESCALE_P2_4
- paddd xmm5,xmm1
- paddd xmm3,xmm1
- psrad xmm5,DESCALE_P2_4
- psrad xmm3,DESCALE_P2_4
+ paddd xmm4, xmm1
+ paddd xmm7, xmm1
+ psrad xmm4, DESCALE_P2_4
+ psrad xmm7, DESCALE_P2_4
+ paddd xmm5, xmm1
+ paddd xmm3, xmm1
+ psrad xmm5, DESCALE_P2_4
+ psrad xmm3, DESCALE_P2_4
- packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
- packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
+ packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
+ packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
- movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
- punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
- punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
+ punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
- movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
- punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
+ movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
- packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
- paddb xmm4,[rel PB_CENTERJSAMP]
+ packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+ paddb xmm4, [rel PB_CENTERJSAMP]
- pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
- pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
- pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+ pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+ pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+ pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
- movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
- mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
- movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+ mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+ movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
+ uncollect_args
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
; --------------------------------------------------------------------------
; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col
- align 16
- global EXTN(jsimd_idct_2x2_sse2)
+ align 16
+ global EXTN(jsimd_idct_2x2_sse2)
EXTN(jsimd_idct_2x2_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- ; ---- Pass 1: process columns from input.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
-
- ; | input: | result: |
- ; | 00 01 ** 03 ** 05 ** 07 | |
- ; | 10 11 ** 13 ** 15 ** 17 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
- ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
- ; | 50 51 ** 53 ** 55 ** 57 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 70 71 ** 73 ** 75 ** 77 | |
-
- ; -- Odd part
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args
+ push rbx
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
+
+ ; -- Odd part
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
- ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+ ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+ ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
- pcmpeqd xmm7,xmm7
- pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+ pcmpeqd xmm7, xmm7
+ pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
- movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
- movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
- punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
- punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
- pmaddwd xmm4,[rel PW_F362_MF127]
- pmaddwd xmm5,[rel PW_F085_MF072]
+ movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
+ movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
+ punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
+ punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
+ pmaddwd xmm4, [rel PW_F362_MF127]
+ pmaddwd xmm5, [rel PW_F085_MF072]
- psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
- pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
- psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
- pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
- por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
- por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
- pmaddwd xmm0,[rel PW_F362_MF127]
- pmaddwd xmm2,[rel PW_F085_MF072]
+ psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
+ pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+ psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
+ pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+ por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
+ por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
+ pmaddwd xmm0, [rel PW_F362_MF127]
+ pmaddwd xmm2, [rel PW_F085_MF072]
- paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
- paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
+ paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
+ paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
- ; -- Even part
+ ; -- Even part
- movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- ; xmm6=(00 01 ** 03 ** 05 ** 07)
+ ; xmm6=(00 01 ** 03 ** 05 ** 07)
- movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
- pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
- pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
- psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
- psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+ movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
+ pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
+ pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+ psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+ psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
- ; -- Final output stage
+ ; -- Final output stage
- movdqa xmm3,xmm6
- movdqa xmm5,xmm1
- paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
- paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
- psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
- psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+ movdqa xmm3, xmm6
+ movdqa xmm5, xmm1
+ paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+ paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+ psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+ psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
- movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
+ movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
- punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
+ punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
- movdqa xmm7,xmm1
- punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
- punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
+ movdqa xmm7, xmm1
+ punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
+ punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
- paddd xmm6,xmm2
- psrad xmm6,DESCALE_P1_2
+ paddd xmm6, xmm2
+ psrad xmm6, DESCALE_P1_2
- paddd xmm1,xmm2
- paddd xmm7,xmm2
- psrad xmm1,DESCALE_P1_2
- psrad xmm7,DESCALE_P1_2
+ paddd xmm1, xmm2
+ paddd xmm7, xmm2
+ psrad xmm1, DESCALE_P1_2
+ psrad xmm7, DESCALE_P1_2
- ; -- Prefetch the next coefficient block
+ ; -- Prefetch the next coefficient block
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
- ; ---- Pass 2: process rows, store into output array.
+ ; ---- Pass 2: process rows, store into output array.
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
- ; | input:| result:|
- ; | A0 B0 | |
- ; | A1 B1 | C0 C1 |
- ; | A3 B3 | D0 D1 |
- ; | A5 B5 | |
- ; | A7 B7 | |
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
- ; -- Odd part
+ ; -- Odd part
- packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
- packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
- pmaddwd xmm1,[rel PW_F362_MF127]
- pmaddwd xmm7,[rel PW_F085_MF072]
+ packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+ packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+ pmaddwd xmm1, [rel PW_F362_MF127]
+ pmaddwd xmm7, [rel PW_F085_MF072]
- paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
+ paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
- ; -- Even part
+ ; -- Even part
- pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
+ pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
- ; -- Final output stage
+ ; -- Final output stage
- movdqa xmm4,xmm6
- paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
- psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+ movdqa xmm4, xmm6
+ paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+ psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
- punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
+ punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
- paddd xmm6,[rel PD_DESCALE_P2_2]
- psrad xmm6,DESCALE_P2_2
+ paddd xmm6, [rel PD_DESCALE_P2_2]
+ psrad xmm6, DESCALE_P2_2
- packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
- packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
- paddb xmm6,[rel PB_CENTERJSAMP]
+ packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+ packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+ paddb xmm6, [rel PB_CENTERJSAMP]
- pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
- pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
+ pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
+ pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx
- mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
+ mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx
+ mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
- pop rbx
- uncollect_args
- pop rbp
- ret
+ pop rbx
+ uncollect_args
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; --------------------------------------------------------------------------
-%define CONST_BITS 13
-%define PASS1_BITS 2
+%define CONST_BITS 13
+%define PASS1_BITS 2
-%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
+%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
+%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
+%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
+%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
%if CONST_BITS == 13
-F_0_211 equ 1730 ; FIX(0.211164243)
-F_0_509 equ 4176 ; FIX(0.509795579)
-F_0_601 equ 4926 ; FIX(0.601344887)
-F_0_720 equ 5906 ; FIX(0.720959822)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_850 equ 6967 ; FIX(0.850430095)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_061 equ 8697 ; FIX(1.061594337)
-F_1_272 equ 10426 ; FIX(1.272758580)
-F_1_451 equ 11893 ; FIX(1.451774981)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_2_172 equ 17799 ; FIX(2.172734803)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_624 equ 29692 ; FIX(3.624509785)
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
-F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
-F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
-F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
-F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
-F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
+F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785)
%endif
; --------------------------------------------------------------------------
- SECTION SEG_CONST
+ SECTION SEG_CONST
- alignz 16
- global EXTN(jconst_idct_red_sse2)
+ alignz 16
+ global EXTN(jconst_idct_red_sse2)
EXTN(jconst_idct_red_sse2):
-PW_F184_MF076 times 4 dw F_1_847,-F_0_765
-PW_F256_F089 times 4 dw F_2_562, F_0_899
-PW_F106_MF217 times 4 dw F_1_061,-F_2_172
-PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021 times 4 dw F_1_451,-F_0_211
-PW_F362_MF127 times 4 dw F_3_624,-F_1_272
-PW_F085_MF072 times 4 dw F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
+PW_F184_MF076 times 4 dw F_1_847,-F_0_765
+PW_F256_F089 times 4 dw F_2_562, F_0_899
+PW_F106_MF217 times 4 dw F_1_061,-F_2_172
+PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
+PW_F145_MF021 times 4 dw F_1_451,-F_0_211
+PW_F362_MF127 times 4 dw F_3_624,-F_1_272
+PW_F085_MF072 times 4 dw F_0_850,-F_0_720
+PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
+PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
+PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
+PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 16
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 4x4 output block.
; JSAMPARRAY output_buf, JDIMENSION output_col)
;
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
+%define dct_table(b) (b)+8 ; void *dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
+%define original_ebp ebp+0
+%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
- align 16
- global EXTN(jsimd_idct_4x4_sse2)
+ align 16
+ global EXTN(jsimd_idct_4x4_sse2)
EXTN(jsimd_idct_4x4_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm0,xmm1
- packsswb xmm0,xmm0
- packsswb xmm0,xmm0
- movd eax,xmm0
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm0,PASS1_BITS
-
- movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
-
- pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
- pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
- pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
- pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
- jmp near .column_end
- alignx 16,7
+ mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm0, xmm1
+ packsswb xmm0, xmm0
+ packsswb xmm0, xmm0
+ movd eax, xmm0
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm0, PASS1_BITS
+
+ movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+ pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+ pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+ pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+ jmp near .column_end
+ alignx 16, 7
%endif
.columnDCT:
- ; -- Odd part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm0
- punpcklwd xmm4,xmm1
- punpckhwd xmm5,xmm1
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
- pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
- pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
- pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
-
- movdqa xmm6,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm6,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
- pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
- pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
-
- paddd xmm6,xmm4 ; xmm6=tmp2L
- paddd xmm7,xmm5 ; xmm7=tmp2H
- paddd xmm2,xmm0 ; xmm2=tmp0L
- paddd xmm3,xmm1 ; xmm3=tmp0H
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
-
- ; -- Even part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- pxor xmm1,xmm1
- pxor xmm2,xmm2
- punpcklwd xmm1,xmm4 ; xmm1=tmp0L
- punpckhwd xmm2,xmm4 ; xmm2=tmp0H
- psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
- psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
- movdqa xmm3,xmm5 ; xmm5=in2=z2
- punpcklwd xmm5,xmm0 ; xmm0=in6=z3
- punpckhwd xmm3,xmm0
- pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
- pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
-
- movdqa xmm4,xmm1
- movdqa xmm0,xmm2
- paddd xmm1,xmm5 ; xmm1=tmp10L
- paddd xmm2,xmm3 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp12L
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- ; -- Final output stage
-
- movdqa xmm5,xmm1
- movdqa xmm3,xmm2
- paddd xmm1,xmm6 ; xmm1=data0L
- paddd xmm2,xmm7 ; xmm2=data0H
- psubd xmm5,xmm6 ; xmm5=data3L
- psubd xmm3,xmm7 ; xmm3=data3H
-
- movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
-
- paddd xmm1,xmm6
- paddd xmm2,xmm6
- psrad xmm1,DESCALE_P1_4
- psrad xmm2,DESCALE_P1_4
- paddd xmm5,xmm6
- paddd xmm3,xmm6
- psrad xmm5,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
-
- movdqa xmm2,xmm4
- movdqa xmm3,xmm0
- paddd xmm4,xmm7 ; xmm4=data1L
- paddd xmm0,xmm6 ; xmm0=data1H
- psubd xmm2,xmm7 ; xmm2=data2L
- psubd xmm3,xmm6 ; xmm3=data2H
-
- movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
-
- paddd xmm4,xmm7
- paddd xmm0,xmm7
- psrad xmm4,DESCALE_P1_4
- psrad xmm0,DESCALE_P1_4
- paddd xmm2,xmm7
- paddd xmm3,xmm7
- psrad xmm2,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
- punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
- movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
- punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm0
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm6, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
+
+ paddd xmm6, xmm4 ; xmm6=tmp2L
+ paddd xmm7, xmm5 ; xmm7=tmp2H
+ paddd xmm2, xmm0 ; xmm2=tmp0L
+ paddd xmm3, xmm1 ; xmm3=tmp0H
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpcklwd xmm1, xmm4 ; xmm1=tmp0L
+ punpckhwd xmm2, xmm4 ; xmm2=tmp0H
+ psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+ psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+ movdqa xmm3, xmm5 ; xmm5=in2=z2
+ punpcklwd xmm5, xmm0 ; xmm0=in6=z3
+ punpckhwd xmm3, xmm0
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
+
+ movdqa xmm4, xmm1
+ movdqa xmm0, xmm2
+ paddd xmm1, xmm5 ; xmm1=tmp10L
+ paddd xmm2, xmm3 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp12L
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ ; -- Final output stage
+
+ movdqa xmm5, xmm1
+ movdqa xmm3, xmm2
+ paddd xmm1, xmm6 ; xmm1=data0L
+ paddd xmm2, xmm7 ; xmm2=data0H
+ psubd xmm5, xmm6 ; xmm5=data3L
+ psubd xmm3, xmm7 ; xmm3=data3H
+
+ movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
+
+ paddd xmm1, xmm6
+ paddd xmm2, xmm6
+ psrad xmm1, DESCALE_P1_4
+ psrad xmm2, DESCALE_P1_4
+ paddd xmm5, xmm6
+ paddd xmm3, xmm6
+ psrad xmm5, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
+
+ movdqa xmm2, xmm4
+ movdqa xmm3, xmm0
+ paddd xmm4, xmm7 ; xmm4=data1L
+ paddd xmm0, xmm6 ; xmm0=data1H
+ psubd xmm2, xmm7 ; xmm2=data2L
+ psubd xmm3, xmm6 ; xmm3=data2H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
+
+ paddd xmm4, xmm7
+ paddd xmm0, xmm7
+ psrad xmm4, DESCALE_P1_4
+ psrad xmm0, DESCALE_P1_4
+ paddd xmm2, xmm7
+ paddd xmm3, xmm7
+ psrad xmm2, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
.column_end:
- ; -- Prefetch the next coefficient block
+ ; -- Prefetch the next coefficient block
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
- ; ---- Pass 2: process rows, store into output array.
+ ; ---- Pass 2: process rows, store into output array.
- mov eax, [original_ebp]
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
- ; -- Even part
+ ; -- Even part
- pxor xmm4,xmm4
- punpcklwd xmm4,xmm1 ; xmm4=tmp0
- psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+ pxor xmm4, xmm4
+ punpcklwd xmm4, xmm1 ; xmm4=tmp0
+ psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
- ; -- Odd part
+ ; -- Odd part
- punpckhwd xmm1,xmm0
- punpckhwd xmm6,xmm3
- movdqa xmm5,xmm1
- movdqa xmm2,xmm6
- pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
- pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
- pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
+ punpckhwd xmm1, xmm0
+ punpckhwd xmm6, xmm3
+ movdqa xmm5, xmm1
+ movdqa xmm2, xmm6
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
- paddd xmm6,xmm1 ; xmm6=tmp2
- paddd xmm2,xmm5 ; xmm2=tmp0
+ paddd xmm6, xmm1 ; xmm6=tmp2
+ paddd xmm2, xmm5 ; xmm2=tmp0
- ; -- Even part
+ ; -- Even part
- punpcklwd xmm0,xmm3
- pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
+ punpcklwd xmm0, xmm3
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
- movdqa xmm7,xmm4
- paddd xmm4,xmm0 ; xmm4=tmp10
- psubd xmm7,xmm0 ; xmm7=tmp12
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm0 ; xmm4=tmp10
+ psubd xmm7, xmm0 ; xmm7=tmp12
- ; -- Final output stage
+ ; -- Final output stage
- movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
- movdqa xmm5,xmm4
- movdqa xmm3,xmm7
- paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
- paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
- psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
- psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm7
+ paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
+ paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
+ psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
+ psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
- paddd xmm4,xmm1
- paddd xmm7,xmm1
- psrad xmm4,DESCALE_P2_4
- psrad xmm7,DESCALE_P2_4
- paddd xmm5,xmm1
- paddd xmm3,xmm1
- psrad xmm5,DESCALE_P2_4
- psrad xmm3,DESCALE_P2_4
+ paddd xmm4, xmm1
+ paddd xmm7, xmm1
+ psrad xmm4, DESCALE_P2_4
+ psrad xmm7, DESCALE_P2_4
+ paddd xmm5, xmm1
+ paddd xmm3, xmm1
+ psrad xmm5, DESCALE_P2_4
+ psrad xmm3, DESCALE_P2_4
- packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
- packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
+ packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
+ packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
- movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
- punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
- punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
+ punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
- movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
- punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
+ movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
- packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
- paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
+ packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+ paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
- pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
- pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
- pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+ pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+ pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+ pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
- movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
- mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
- movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
; --------------------------------------------------------------------------
; JSAMPARRAY output_buf, JDIMENSION output_col)
;
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
+%define dct_table(b) (b)+8 ; void *dct_table
+%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
+%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
+%define output_col(b) (b)+20 ; JDIMENSION output_col
- align 16
- global EXTN(jsimd_idct_2x2_sse2)
+ align 16
+ global EXTN(jsimd_idct_2x2_sse2)
EXTN(jsimd_idct_2x2_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
- mov edx, POINTER [dct_table(ebp)] ; quantptr
- mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
-
- ; | input: | result: |
- ; | 00 01 ** 03 ** 05 ** 07 | |
- ; | 10 11 ** 13 ** 15 ** 17 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
- ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
- ; | 50 51 ** 53 ** 55 ** 57 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 70 71 ** 73 ** 75 ** 77 | |
-
- ; -- Odd part
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+ mov edx, POINTER [dct_table(ebp)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
+
+ ; -- Odd part
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
- ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+ ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+ ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
- pcmpeqd xmm7,xmm7
- pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+ pcmpeqd xmm7, xmm7
+ pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
- movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
- movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
- punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
- punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
- pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)]
+ movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
+ movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
+ punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
+ punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)]
- psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
- pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
- psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
- pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
- por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
- por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
- pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)]
+ psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
+ pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+ psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
+ pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+ por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
+ por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)]
- paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
- paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
+ paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
+ paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
- ; -- Even part
+ ; -- Even part
- movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- ; xmm6=(00 01 ** 03 ** 05 ** 07)
+ ; xmm6=(00 01 ** 03 ** 05 ** 07)
- movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
- pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
- pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
- psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
- psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+ movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
+ pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
+ pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+ psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+ psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
- ; -- Final output stage
+ ; -- Final output stage
- movdqa xmm3,xmm6
- movdqa xmm5,xmm1
- paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
- paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
- psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
- psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+ movdqa xmm3, xmm6
+ movdqa xmm5, xmm1
+ paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+ paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+ psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+ psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
- movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
- punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
+ punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
- movdqa xmm7,xmm1
- punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
- punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
+ movdqa xmm7, xmm1
+ punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
+ punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
- paddd xmm6,xmm2
- psrad xmm6,DESCALE_P1_2
+ paddd xmm6, xmm2
+ psrad xmm6, DESCALE_P1_2
- paddd xmm1,xmm2
- paddd xmm7,xmm2
- psrad xmm1,DESCALE_P1_2
- psrad xmm7,DESCALE_P1_2
+ paddd xmm1, xmm2
+ paddd xmm7, xmm2
+ psrad xmm1, DESCALE_P1_2
+ psrad xmm7, DESCALE_P1_2
- ; -- Prefetch the next coefficient block
+ ; -- Prefetch the next coefficient block
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
- ; ---- Pass 2: process rows, store into output array.
+ ; ---- Pass 2: process rows, store into output array.
- mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(ebp)]
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(ebp)]
- ; | input:| result:|
- ; | A0 B0 | |
- ; | A1 B1 | C0 C1 |
- ; | A3 B3 | D0 D1 |
- ; | A5 B5 | |
- ; | A7 B7 | |
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
- ; -- Odd part
+ ; -- Odd part
- packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
- packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
- pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)]
+ packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+ packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)]
- paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
+ paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
- ; -- Even part
+ ; -- Even part
- pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
+ pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
- ; -- Final output stage
+ ; -- Final output stage
- movdqa xmm4,xmm6
- paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
- psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+ movdqa xmm4, xmm6
+ paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+ psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
- punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
+ punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
- paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
- psrad xmm6,DESCALE_P2_2
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
+ psrad xmm6, DESCALE_P2_2
- packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
- packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
- paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
+ packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+ packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+ paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
- pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
- pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
+ pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
+ pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
- mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
+ mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
%include "jdct.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
;
; Load data into workspace, applying unsigned->signed conversion
;
; r11 = JDIMENSION start_col
; r12 = FAST_FLOAT *workspace
- align 16
- global EXTN(jsimd_convsamp_float_sse2)
+ align 16
+ global EXTN(jsimd_convsamp_float_sse2)
EXTN(jsimd_convsamp_float_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- pcmpeqw xmm7,xmm7
- psllw xmm7,7
- packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
- mov rsi, r10
- mov eax, r11d
- mov rdi, r12
- mov rcx, DCTSIZE/2
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args
+ push rbx
+
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7
+ packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov rsi, r10
+ mov eax, r11d
+ mov rdi, r12
+ mov rcx, DCTSIZE/2
.convloop:
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
- movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
+ movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
+ movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
- psubb xmm0,xmm7 ; xmm0=(01234567)
- psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
+ psubb xmm0, xmm7 ; xmm0=(01234567)
+ psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
- punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
- punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
+ punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
+ punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
- punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
- punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
- punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
- punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
+ punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
+ punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
+ punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
+ punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
- psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
- psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
- cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
- cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
- psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
- psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
- cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
- cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
+ psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
+ psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
+ cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
+ cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
+ psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
+ psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
+ cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
+ cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
- movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
- add rsi, byte 2*SIZEOF_JSAMPROW
- add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec rcx
- jnz short .convloop
+ add rsi, byte 2*SIZEOF_JSAMPROW
+ add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec rcx
+ jnz short .convloop
- pop rbx
- uncollect_args
- pop rbp
- ret
+ pop rbx
+ uncollect_args
+ pop rbp
+ ret
; --------------------------------------------------------------------------
; r11 = FAST_FLOAT *divisors
; r12 = FAST_FLOAT *workspace
- align 16
- global EXTN(jsimd_quantize_float_sse2)
+ align 16
+ global EXTN(jsimd_quantize_float_sse2)
EXTN(jsimd_quantize_float_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov rsi, r12
- mov rdx, r11
- mov rdi, r10
- mov rax, DCTSIZE2/16
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args
+
+ mov rsi, r12
+ mov rdx, r11
+ mov rdi, r10
+ mov rax, DCTSIZE2/16
.quantloop:
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
- mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
- mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
- cvtps2dq xmm0,xmm0
- cvtps2dq xmm1,xmm1
- cvtps2dq xmm2,xmm2
- cvtps2dq xmm3,xmm3
-
- packssdw xmm0,xmm1
- packssdw xmm2,xmm3
-
- movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
-
- add rsi, byte 16*SIZEOF_FAST_FLOAT
- add rdx, byte 16*SIZEOF_FAST_FLOAT
- add rdi, byte 16*SIZEOF_JCOEF
- dec rax
- jnz short .quantloop
-
- uncollect_args
- pop rbp
- ret
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+ add rsi, byte 16*SIZEOF_FAST_FLOAT
+ add rdx, byte 16*SIZEOF_FAST_FLOAT
+ add rdi, byte 16*SIZEOF_JCOEF
+ dec rax
+ jnz short .quantloop
+
+ uncollect_args
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
%include "jdct.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Load data into workspace, applying unsigned->signed conversion
;
; FAST_FLOAT *workspace);
;
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; FAST_FLOAT *workspace
+%define sample_data ebp+8 ; JSAMPARRAY sample_data
+%define start_col ebp+12 ; JDIMENSION start_col
+%define workspace ebp+16 ; FAST_FLOAT *workspace
- align 16
- global EXTN(jsimd_convsamp_float_sse2)
+ align 16
+ global EXTN(jsimd_convsamp_float_sse2)
EXTN(jsimd_convsamp_float_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pcmpeqw xmm7,xmm7
- psllw xmm7,7
- packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (DCTELEM *)
- mov ecx, DCTSIZE/2
- alignx 16,7
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7
+ packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
- movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
- psubb xmm0,xmm7 ; xmm0=(01234567)
- psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
-
- punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
- punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
-
- punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
- punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
- punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
- punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
-
- psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
- psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
- cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
- cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
- psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
- psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
- cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
- cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-
- add esi, byte 2*SIZEOF_JSAMPROW
- add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz short .convloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ psubb xmm0, xmm7 ; xmm0=(01234567)
+ psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
+
+ punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
+ punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
+
+ punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
+ punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
+ punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
+ punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
+
+ psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
+ psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
+ cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
+ cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
+ psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
+ psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
+ cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
+ cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+ add esi, byte 2*SIZEOF_JSAMPROW
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz short .convloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
; --------------------------------------------------------------------------
; FAST_FLOAT *workspace);
;
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; FAST_FLOAT *divisors
-%define workspace ebp+16 ; FAST_FLOAT *workspace
+%define coef_block ebp+8 ; JCOEFPTR coef_block
+%define divisors ebp+12 ; FAST_FLOAT *divisors
+%define workspace ebp+16 ; FAST_FLOAT *workspace
- align 16
- global EXTN(jsimd_quantize_float_sse2)
+ align 16
+ global EXTN(jsimd_quantize_float_sse2)
EXTN(jsimd_quantize_float_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov eax, DCTSIZE2/16
- alignx 16,7
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16
+ alignx 16, 7
.quantloop:
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
- cvtps2dq xmm0,xmm0
- cvtps2dq xmm1,xmm1
- cvtps2dq xmm2,xmm2
- cvtps2dq xmm3,xmm3
-
- packssdw xmm0,xmm1
- packssdw xmm2,xmm3
-
- movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
-
- add esi, byte 16*SIZEOF_FAST_FLOAT
- add edx, byte 16*SIZEOF_FAST_FLOAT
- add edi, byte 16*SIZEOF_JCOEF
- dec eax
- jnz short .quantloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz short .quantloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
%include "jdct.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+ SECTION SEG_TEXT
+ BITS 64
;
; Load data into workspace, applying unsigned->signed conversion
;
; r11 = JDIMENSION start_col
; r12 = DCTELEM *workspace
- align 16
- global EXTN(jsimd_convsamp_sse2)
+ align 16
+ global EXTN(jsimd_convsamp_sse2)
EXTN(jsimd_convsamp_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- pxor xmm6,xmm6 ; xmm6=(all 0's)
- pcmpeqw xmm7,xmm7
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- mov rsi, r10
- mov eax, r11d
- mov rdi, r12
- mov rcx, DCTSIZE/4
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args
+ push rbx
+
+ pxor xmm6, xmm6 ; xmm6=(all 0's)
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ mov rsi, r10
+ mov eax, r11d
+ mov rdi, r12
+ mov rcx, DCTSIZE/4
.convloop:
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
- movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
-
- mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
- movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
-
- punpcklbw xmm0,xmm6 ; xmm0=(01234567)
- punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
- punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
- paddw xmm2,xmm7
- paddw xmm3,xmm7
-
- movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
- add rsi, byte 4*SIZEOF_JSAMPROW
- add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec rcx
- jnz short .convloop
-
- pop rbx
- uncollect_args
- pop rbp
- ret
+ mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
+ movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
+
+ mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
+ movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
+
+ punpcklbw xmm0, xmm6 ; xmm0=(01234567)
+ punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
+ punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+ add rsi, byte 4*SIZEOF_JSAMPROW
+ add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec rcx
+ jnz short .convloop
+
+ pop rbx
+ uncollect_args
+ pop rbp
+ ret
; --------------------------------------------------------------------------
;
; DCTELEM *workspace);
;
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
; r10 = JCOEFPTR coef_block
; r11 = DCTELEM *divisors
; r12 = DCTELEM *workspace
- align 16
- global EXTN(jsimd_quantize_sse2)
+ align 16
+ global EXTN(jsimd_quantize_sse2)
EXTN(jsimd_quantize_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov rsi, r12
- mov rdx, r11
- mov rdi, r10
- mov rax, DCTSIZE2/32
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args
+
+ mov rsi, r12
+ mov rdx, r11
+ mov rdi, r10
+ mov rax, DCTSIZE2/32
.quantloop:
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- psraw xmm4,(WORD_BIT-1)
- psraw xmm5,(WORD_BIT-1)
- psraw xmm6,(WORD_BIT-1)
- psraw xmm7,(WORD_BIT-1)
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
- psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
- psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
- psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
-
- paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
- paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
- paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
- paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
- pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal
- pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
- pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
- pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
- pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
- pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
- pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
- pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
-
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4
- psubw xmm1,xmm5
- psubw xmm2,xmm6
- psubw xmm3,xmm7
- movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
- add rsi, byte 32*SIZEOF_DCTELEM
- add rdx, byte 32*SIZEOF_DCTELEM
- add rdi, byte 32*SIZEOF_JCOEF
- dec rax
- jnz near .quantloop
-
- uncollect_args
- pop rbp
- ret
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ psraw xmm4, (WORD_BIT-1)
+ psraw xmm5, (WORD_BIT-1)
+ psraw xmm6, (WORD_BIT-1)
+ psraw xmm7, (WORD_BIT-1)
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
+ psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
+ psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
+ psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
+
+ paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
+ paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+ paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+ paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+ pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal
+ pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+ pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+ pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+ pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
+ pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
+ pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
+ pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+ psubw xmm2, xmm6
+ psubw xmm3, xmm7
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+ add rsi, byte 32*SIZEOF_DCTELEM
+ add rdx, byte 32*SIZEOF_DCTELEM
+ add rdi, byte 32*SIZEOF_JCOEF
+ dec rax
+ jnz near .quantloop
+
+ uncollect_args
+ pop rbp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
%include "jdct.inc"
; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
+ SECTION SEG_TEXT
+ BITS 32
;
; Load data into workspace, applying unsigned->signed conversion
;
; DCTELEM *workspace);
;
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; DCTELEM *workspace
+%define sample_data ebp+8 ; JSAMPARRAY sample_data
+%define start_col ebp+12 ; JDIMENSION start_col
+%define workspace ebp+16 ; DCTELEM *workspace
- align 16
- global EXTN(jsimd_convsamp_sse2)
+ align 16
+ global EXTN(jsimd_convsamp_sse2)
EXTN(jsimd_convsamp_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pxor xmm6,xmm6 ; xmm6=(all 0's)
- pcmpeqw xmm7,xmm7
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pxor xmm6, xmm6 ; xmm6=(all 0's)
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
- movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
-
- mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
- movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
-
- punpcklbw xmm0,xmm6 ; xmm0=(01234567)
- punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
- punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
- paddw xmm2,xmm7
- paddw xmm3,xmm7
-
- movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
- add esi, byte 4*SIZEOF_JSAMPROW
- add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec ecx
- jnz short .convloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
+ movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
+
+ punpcklbw xmm0, xmm6 ; xmm0=(01234567)
+ punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
+ punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+ add esi, byte 4*SIZEOF_JSAMPROW
+ add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz short .convloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
; --------------------------------------------------------------------------
;
; DCTELEM *workspace);
;
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; DCTELEM *divisors
-%define workspace ebp+16 ; DCTELEM *workspace
+%define coef_block ebp+8 ; JCOEFPTR coef_block
+%define divisors ebp+12 ; DCTELEM *divisors
+%define workspace ebp+16 ; DCTELEM *workspace
- align 16
- global EXTN(jsimd_quantize_sse2)
+ align 16
+ global EXTN(jsimd_quantize_sse2)
EXTN(jsimd_quantize_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov eax, DCTSIZE2/32
- alignx 16,7
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/32
+ alignx 16, 7
.quantloop:
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- psraw xmm4,(WORD_BIT-1)
- psraw xmm5,(WORD_BIT-1)
- psraw xmm6,(WORD_BIT-1)
- psraw xmm7,(WORD_BIT-1)
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
- psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
- psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
- psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
-
- paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
- paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
- paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
- paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
- pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
- pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
- pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
- pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
- pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
- pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
- pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
- pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
-
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4
- psubw xmm1,xmm5
- psubw xmm2,xmm6
- psubw xmm3,xmm7
- movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
- add esi, byte 32*SIZEOF_DCTELEM
- add edx, byte 32*SIZEOF_DCTELEM
- add edi, byte 32*SIZEOF_JCOEF
- dec eax
- jnz near .quantloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ psraw xmm4, (WORD_BIT-1)
+ psraw xmm5, (WORD_BIT-1)
+ psraw xmm6, (WORD_BIT-1)
+ psraw xmm7, (WORD_BIT-1)
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
+ psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
+ psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
+ psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
+
+ paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
+ paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
+ paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
+ pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+ pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+ pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+ pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
+ pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
+ pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
+ pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+ psubw xmm2, xmm6
+ psubw xmm3, xmm7
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+ add esi, byte 32*SIZEOF_DCTELEM
+ add edx, byte 32*SIZEOF_DCTELEM
+ add edi, byte 32*SIZEOF_JCOEF
+ dec eax
+ jnz near .quantloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
- align 16
+ align 16
; -- jpeglib.h
;
-%define _cpp_protection_DCTSIZE DCTSIZE
-%define _cpp_protection_DCTSIZE2 DCTSIZE2
+%define _cpp_protection_DCTSIZE DCTSIZE
+%define _cpp_protection_DCTSIZE2 DCTSIZE2
;
; -- jmorecfg.h
;
-%define _cpp_protection_RGB_RED RGB_RED
-%define _cpp_protection_RGB_GREEN RGB_GREEN
-%define _cpp_protection_RGB_BLUE RGB_BLUE
-%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
+%define _cpp_protection_RGB_RED RGB_RED
+%define _cpp_protection_RGB_GREEN RGB_GREEN
+%define _cpp_protection_RGB_BLUE RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
-%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
-%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
-%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
-%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
-%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
-%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
-%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
-%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
-%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
-%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
+%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
-%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
-%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
-%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
-%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
-%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
-%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
-%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
-%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
-%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
-%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define RGBX_FILLER_0XFF 1
+%define RGBX_FILLER_0XFF 1
; Representation of a single sample (pixel element value).
; On this SIMD implementation, this must be 'unsigned char'.
;
-%define JSAMPLE byte ; unsigned char
-%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
-%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
+%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
; Representation of a DCT frequency coefficient.
; On this SIMD implementation, this must be 'short'.
;
-%define JCOEF word ; short
-%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
; Datatype used for image dimensions.
; On this SIMD implementation, this must be 'unsigned int'.
;
-%define JDIMENSION dword ; unsigned int
-%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
-
-%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
-%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
-%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
-%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
-%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
-%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
-%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
-%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+
+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
;
; -- jdct.h
; the DCT is to be performed in-place in that buffer.
; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
;
-%define DCTELEM word ; short
-%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
-%define FAST_FLOAT FP32 ; float
-%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
+%define FAST_FLOAT FP32 ; float
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
; To maximize parallelism, Type MULTIPLIER is changed to short.
;
-%define ISLOW_MULT_TYPE word ; must be short
-%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
-%define IFAST_MULT_TYPE word ; must be short
-%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
-%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
-%define FLOAT_MULT_TYPE FP32 ; must be float
-%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
;
; -- jsimd.h
;
-%define _cpp_protection_JSIMD_NONE JSIMD_NONE
-%define _cpp_protection_JSIMD_MMX JSIMD_MMX
-%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
-%define _cpp_protection_JSIMD_SSE JSIMD_SSE
-%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
+%define _cpp_protection_JSIMD_NONE JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
; -- segment definition --
;
%ifdef __YASM_VER__
-%define SEG_TEXT .text align=16
-%define SEG_CONST .rdata align=16
+%define SEG_TEXT .text align=16
+%define SEG_CONST .rdata align=16
%else
-%define SEG_TEXT .text align=16 public use32 class=CODE
-%define SEG_CONST .rdata align=16 public use32 class=CONST
+%define SEG_TEXT .text align=16 public use32 class=CODE
+%define SEG_CONST .rdata align=16 public use32 class=CONST
%endif
%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
%define SEG_TEXT .text align=16 public use64 class=CODE
%define SEG_CONST .rdata align=16 public use64 class=CONST
%endif
-%define EXTN(name) name ; foo() -> foo
+%define EXTN(name) name ; foo() -> foo
%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)
; -- segment definition --
;
-%define SEG_TEXT _text align=16 public use32 class=CODE
-%define SEG_CONST _data align=16 public use32 class=DATA
+%define SEG_TEXT _text align=16 public use32 class=CODE
+%define SEG_CONST _data align=16 public use32 class=DATA
%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; -- segment definition --
;
%ifdef __x86_64__
-%define SEG_TEXT .text progbits align=16
-%define SEG_CONST .rodata progbits align=16
+%define SEG_TEXT .text progbits align=16
+%define SEG_CONST .rodata progbits align=16
%else
-%define SEG_TEXT .text progbits alloc exec nowrite align=16
-%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
+%define SEG_TEXT .text progbits alloc exec nowrite align=16
+%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%endif
; To make the code position-independent, append -DPIC to the commandline
;
-%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
-%define EXTN(name) name ; foo() -> foo
+%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
+%define EXTN(name) name ; foo() -> foo
%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
; -- segment definition --
;
-%define SEG_TEXT .text
-%define SEG_CONST .data
+%define SEG_TEXT .text
+%define SEG_CONST .data
; To make the code position-independent, append -DPIC to the commandline
;
-%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
+%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
; -- segment definition --
;
-%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why?
-%define SEG_CONST .rodata align=16
+%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why?
+%define SEG_CONST .rodata align=16
; The generation of position-independent code (PIC) is the default on Darwin.
;
; -- segment definition --
;
-%define SEG_TEXT .text
-%define SEG_CONST .data
+%define SEG_TEXT .text
+%define SEG_CONST .data
-%endif ; ----------------------------------------------
+%endif ; ----------------------------------------------
; ==========================================================================
; Common types
;
%ifdef __x86_64__
-%define POINTER qword ; general pointer type
-%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
-%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define POINTER qword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
+%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
%else
-%define POINTER dword ; general pointer type
-%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
-%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define POINTER dword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
+%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
%endif
-%define INT dword ; signed integer type
-%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
-%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
+%define INT dword ; signed integer type
+%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
+%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
-%define FP32 dword ; IEEE754 single
-%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
-%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
+%define FP32 dword ; IEEE754 single
+%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
+%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
-%define MMWORD qword ; int64 (MMX register)
-%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
-%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
+%define MMWORD qword ; int64 (MMX register)
+%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
+%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
-%define XMMWORD ; int128 (SSE register)
-%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
-%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
+%define XMMWORD ; int128 (SSE register)
+%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
+%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD
-%define SIZEOF_BYTE 1 ; sizeof(BYTE)
-%define SIZEOF_WORD 2 ; sizeof(WORD)
-%define SIZEOF_DWORD 4 ; sizeof(DWORD)
-%define SIZEOF_QWORD 8 ; sizeof(QWORD)
-%define SIZEOF_OWORD 16 ; sizeof(OWORD)
+%define SIZEOF_BYTE 1 ; sizeof(BYTE)
+%define SIZEOF_WORD 2 ; sizeof(WORD)
+%define SIZEOF_DWORD 4 ; sizeof(DWORD)
+%define SIZEOF_QWORD 8 ; sizeof(QWORD)
+%define SIZEOF_OWORD 16 ; sizeof(OWORD)
-%define BYTE_BIT 8 ; CHAR_BIT in C
-%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
-%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
-%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
-%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
+%define BYTE_BIT 8 ; CHAR_BIT in C
+%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
+%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
+%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
+%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
; --------------------------------------------------------------------------
; External Symbol Name
;
%ifndef EXTN
-%define EXTN(name) _ %+ name ; foo() -> _foo
+%define EXTN(name) _ %+ name ; foo() -> _foo
%endif
; --------------------------------------------------------------------------
%undef PIC
%endif
-%ifdef PIC ; -------------------------------------------
+%ifdef PIC ; -------------------------------------------
-%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
+%ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.
- SECTION SEG_CONST
+ SECTION SEG_CONST
const_base:
-%define GOTOFF(got,sym) (got) + (sym) - const_base
+%define GOTOFF(got,sym) (got) + (sym) - const_base
%imacro get_GOT 1
- ; NOTE: this macro destroys ecx resister.
- call %%geteip
- add ecx, byte (%%ref - $)
- jmp short %%adjust
+ ; NOTE: this macro destroys ecx register.
+ call %%geteip
+ add ecx, byte (%%ref - $)
+ jmp short %%adjust
%%geteip:
- mov ecx, POINTER [esp]
- ret
+ mov ecx, POINTER [esp]
+ ret
%%adjust:
- push ebp
- xor ebp,ebp ; ebp = 0
-%ifidni %1,ebx ; (%1 == ebx)
- ; db 0x8D,0x9C + jmp near const_base =
- ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
- db 0x8D,0x9C ; 8D,9C
- jmp near const_base ; E9,(const_base-%%ref)
+ push ebp
+ xor ebp, ebp ; ebp = 0
+%ifidni %1, ebx ; (%1 == ebx)
+ ; db 0x8D,0x9C + jmp near const_base =
+ ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+ db 0x8D, 0x9C ; 8D,9C
+ jmp near const_base ; E9,(const_base-%%ref)
%%ref:
%else ; (%1 != ebx)
- ; db 0x8D,0x8C + jmp near const_base =
- ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
- db 0x8D,0x8C ; 8D,8C
- jmp near const_base ; E9,(const_base-%%ref)
-%%ref: mov %1, ecx
-%endif ; (%1 == ebx)
- pop ebp
+ ; db 0x8D,0x8C + jmp near const_base =
+ ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+ db 0x8D, 0x8C ; 8D,8C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref:
+ mov %1, ecx
+%endif ; (%1 == ebx)
+ pop ebp
%endmacro
-%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
+%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
-%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
+%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
%imacro get_GOT 1
- extern GOT_SYMBOL
- call %%geteip
- add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
- jmp short %%done
+ extern GOT_SYMBOL
+ call %%geteip
+ add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+ jmp short %%done
%%geteip:
- mov %1, POINTER [esp]
- ret
+ mov %1, POINTER [esp]
+ ret
%%done:
%endmacro
-%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
+%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
%imacro pushpic 1.nolist
- push %1
+ push %1
%endmacro
%imacro poppic 1.nolist
- pop %1
+ pop %1
%endmacro
%imacro movpic 2.nolist
- mov %1,%2
+ mov %1, %2
%endmacro
-%else ; !PIC -----------------------------------------
+%else ; !PIC -----------------------------------------
-%define GOTOFF(got,sym) (sym)
+%define GOTOFF(got,sym) (sym)
%imacro get_GOT 1.nolist
%endmacro
%imacro movpic 2.nolist
%endmacro
-%endif ; PIC -----------------------------------------
+%endif ; PIC -----------------------------------------
; --------------------------------------------------------------------------
; Align the next instruction on {2,4,8,16,..}-byte boundary.
%define FILLB(b,n) (($$-(b)) & ((n)-1))
%imacro alignx 1-2.nolist 0xFFFF
-%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
- db 0x90 ; nop
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
- db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
- db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
- db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
- db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
- db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
- db 0x8B,0xED ; mov ebp,ebp
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
- db 0x90 ; nop
+%%bs: \
+ times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
+ db 0x90 ; nop
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
+ db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
+ db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
+ db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
+ db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
+ db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
+ db 0x8B,0xED ; mov ebp,ebp
+ times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
+ db 0x90 ; nop
%endmacro
; Align the next data on {2,4,8,16,..}-byte boundary.
;
%imacro alignz 1.nolist
- align %1, db 0 ; filling zeros
+ align %1, db 0 ; filling zeros
%endmacro
%ifdef __x86_64__
%ifdef WIN64
%imacro collect_args 0
- push r12
- push r13
- push r14
- push r15
- mov r10, rcx
- mov r11, rdx
- mov r12, r8
- mov r13, r9
- mov r14, [rax+48]
- mov r15, [rax+56]
- push rsi
- push rdi
- sub rsp, SIZEOF_XMMWORD
- movaps XMMWORD [rsp], xmm6
- sub rsp, SIZEOF_XMMWORD
- movaps XMMWORD [rsp], xmm7
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r10, rcx
+ mov r11, rdx
+ mov r12, r8
+ mov r13, r9
+ mov r14, [rax+48]
+ mov r15, [rax+56]
+ push rsi
+ push rdi
+ sub rsp, SIZEOF_XMMWORD
+ movaps XMMWORD [rsp], xmm6
+ sub rsp, SIZEOF_XMMWORD
+ movaps XMMWORD [rsp], xmm7
%endmacro
%imacro uncollect_args 0
- movaps xmm7, XMMWORD [rsp]
- add rsp, SIZEOF_XMMWORD
- movaps xmm6, XMMWORD [rsp]
- add rsp, SIZEOF_XMMWORD
- pop rdi
- pop rsi
- pop r15
- pop r14
- pop r13
- pop r12
+ movaps xmm7, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD
+ movaps xmm6, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD
+ pop rdi
+ pop rsi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
%endmacro
%else
%imacro collect_args 0
- push r10
- push r11
- push r12
- push r13
- push r14
- push r15
- mov r10, rdi
- mov r11, rsi
- mov r12, rdx
- mov r13, rcx
- mov r14, r8
- mov r15, r9
+ push r10
+ push r11
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r10, rdi
+ mov r11, rsi
+ mov r12, rdx
+ mov r13, rcx
+ mov r14, r8
+ mov r15, r9
%endmacro
%imacro uncollect_args 0
- pop r15
- pop r14
- pop r13
- pop r12
- pop r11
- pop r10
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop r11
+ pop r10
%endmacro
%endif