From: DRC Date: Fri, 27 May 2016 21:58:23 +0000 (-0500) Subject: Reformat SSE/SSE2 SIMD code to improve readability X-Git-Tag: 1.5.90~138 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ff5685d5344273df321eb63a005eaae19d2496e3;p=libjpeg-turbo Reformat SSE/SSE2 SIMD code to improve readability --- diff --git a/simd/jccolext-sse2-64.asm b/simd/jccolext-sse2-64.asm index 8e4642d..bf60459 100644 --- a/simd/jccolext-sse2-64.asm +++ b/simd/jccolext-sse2-64.asm @@ -33,454 +33,454 @@ ; r13 = JDIMENSION output_row ; r14 = int num_rows -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 8 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 8 - align 16 + align 16 - global EXTN(jsimd_rgb_ycc_convert_sse2) + global EXTN(jsimd_rgb_ycc_convert_sse2) EXTN(jsimd_rgb_ycc_convert_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - push rbx - - mov ecx, r10d - test rcx,rcx - jz near .return - - push rcx - - mov rsi, r12 - mov ecx, r13d - mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] - mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] - mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] - lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] - lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] - lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] - - pop rcx - - mov rsi, r11 - mov eax, r14d - test rax,rax - jle near .return + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d + test rcx, rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] + lea rdi, 
[rdi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax, rax + jle near .return .rowloop: - push rdx - push rbx - push rdi - push rsi - push rcx ; col + push rdx + push rbx + push rdi + push rsi + push rcx ; col - mov rsi, JSAMPROW [rsi] ; inptr - mov rdi, JSAMPROW [rdi] ; outptr0 - mov rbx, JSAMPROW [rbx] ; outptr1 - mov rdx, JSAMPROW [rdx] ; outptr2 + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + mov rbx, JSAMPROW [rbx] ; outptr1 + mov rdx, JSAMPROW [rdx] ; outptr2 - cmp rcx, byte SIZEOF_XMMWORD - jae near .columnloop + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop -%if RGB_PIXELSIZE == 3 ; --------------- +%if RGB_PIXELSIZE == 3 ; --------------- .column_ld1: - push rax - push rdx - lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub rcx, byte SIZEOF_BYTE - movzx rax, BYTE [rsi+rcx] + push rax + push rdx + lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, BYTE [rsi+rcx] .column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub rcx, byte SIZEOF_WORD - movzx rdx, WORD [rsi+rcx] - shl rax, WORD_BIT - or rax,rdx + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, WORD [rsi+rcx] + shl rax, WORD_BIT + or rax, rdx .column_ld4: - movd xmmA,eax - pop rdx - pop rax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub rcx, byte SIZEOF_DWORD - movd xmmF, XMM_DWORD [rsi+rcx] - pslldq xmmA, SIZEOF_DWORD - por xmmA,xmmF + movd xmmA, eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF .column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - sub rcx, byte SIZEOF_MMWORD - movq xmmB, XMM_MMWORD [rsi+rcx] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmB + test 
cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB .column_ld16: - test cl, SIZEOF_XMMWORD - jz short .column_ld32 - movdqa xmmF,xmmA - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - mov rcx, SIZEOF_XMMWORD - jmp short .rgb_ycc_cnv + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv .column_ld32: - test cl, 2*SIZEOF_XMMWORD - mov rcx, SIZEOF_XMMWORD - jz short .rgb_ycc_cnv - movdqa xmmB,xmmA - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] - jmp short .rgb_ycc_cnv + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmB, xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv .columnloop: - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] .rgb_ycc_cnv: - ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - movdqa xmmG,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) - psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 
02 0A 12 1A) - pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) - punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) - punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) - movdqa xmmD,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) - psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) - pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) - punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) - punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) - movdqa xmmE,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) - psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) 
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) - punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) - pxor xmmH,xmmH + pxor xmmH, xmmH - movdqa xmmC,xmmA - punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmB,xmmE - punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) - movdqa xmmF,xmmD - punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) - punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) -%else ; RGB_PIXELSIZE == 4 ; ----------- +%else ; RGB_PIXELSIZE == 4 ; ----------- .column_ld1: - test cl, SIZEOF_XMMWORD/16 - jz short .column_ld2 - sub rcx, byte SIZEOF_XMMWORD/16 - movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] .column_ld2: - test cl, SIZEOF_XMMWORD/8 - jz short .column_ld4 - sub rcx, byte SIZEOF_XMMWORD/8 - movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmE + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, 
SIZEOF_MMWORD + por xmmA, xmmE .column_ld4: - test cl, SIZEOF_XMMWORD/4 - jz short .column_ld8 - sub rcx, byte SIZEOF_XMMWORD/4 - movdqa xmmE,xmmA - movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] .column_ld8: - test cl, SIZEOF_XMMWORD/2 - mov rcx, SIZEOF_XMMWORD - jz short .rgb_ycc_cnv - movdqa xmmF,xmmA - movdqa xmmH,xmmE - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] - jmp short .rgb_ycc_cnv + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv .columnloop: - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] - movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] .rgb_ycc_cnv: - ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) - punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) - - movdqa xmmC,xmmF - punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) - punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) - - movdqa xmmB,xmmA - punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) - punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) - - 
movdqa xmmG,xmmD - punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) - punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) - - movdqa xmmE,xmmA - punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) - - movdqa xmmH,xmmB - punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) - punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) - - pxor xmmF,xmmF - - movdqa xmmC,xmmA - punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) - - movdqa xmmD,xmmB - punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) - - movdqa xmmG,xmmE - punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) - - punpcklbw xmmF,xmmH - punpckhbw xmmH,xmmH - psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) - psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) - -%endif ; RGB_PIXELSIZE ; --------------- - - ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE - ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO - movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE - movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO - - movdqa xmm6,xmm1 - punpcklwd xmm1,xmm3 - punpckhwd xmm6,xmm3 - movdqa xmm7,xmm1 - movdqa xmm4,xmm6 - pmaddwd xmm1,[rel 
PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) - pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) - pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) - - movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) - movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) - - pxor xmm1,xmm1 - pxor xmm6,xmm6 - punpcklwd xmm1,xmm5 ; xmm1=BOL - punpckhwd xmm6,xmm5 ; xmm6=BOH - psrld xmm1,1 ; xmm1=BOL*FIX(0.500) - psrld xmm6,1 ; xmm6=BOH*FIX(0.500) - - movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ] - - paddd xmm7,xmm1 - paddd xmm4,xmm6 - paddd xmm7,xmm5 - paddd xmm4,xmm5 - psrld xmm7,SCALEBITS ; xmm7=CbOL - psrld xmm4,SCALEBITS ; xmm4=CbOH - packssdw xmm7,xmm4 ; xmm7=CbO - - movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE - - movdqa xmm6,xmm0 - punpcklwd xmm0,xmm2 - punpckhwd xmm6,xmm2 - movdqa xmm5,xmm0 - movdqa xmm4,xmm6 - pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) - pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) - pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) - - movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) - movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) - - pxor xmm0,xmm0 - pxor xmm6,xmm6 - punpcklwd xmm0,xmm1 ; xmm0=BEL - punpckhwd xmm6,xmm1 ; xmm6=BEH - psrld xmm0,1 ; xmm0=BEL*FIX(0.500) - psrld xmm6,1 ; xmm6=BEH*FIX(0.500) - - movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] - - paddd xmm5,xmm0 - paddd xmm4,xmm6 - paddd xmm5,xmm1 - paddd xmm4,xmm1 - psrld xmm5,SCALEBITS ; xmm5=CbEL - psrld xmm4,SCALEBITS ; xmm4=CbEH - packssdw xmm5,xmm4 ; xmm5=CbE - - psllw xmm7,BYTE_BIT - por xmm5,xmm7 ; xmm5=Cb - movdqa XMMWORD [rbx], xmm5 ; Save Cb - - movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO - movdqa xmm6, XMMWORD [wk(2)] ; 
xmm6=BE - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO - - movdqa xmm4,xmm0 - punpcklwd xmm0,xmm3 - punpckhwd xmm4,xmm3 - movdqa xmm7,xmm0 - movdqa xmm5,xmm4 - pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) - pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) - pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) - - movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] - - paddd xmm0, XMMWORD [wk(4)] - paddd xmm4, XMMWORD [wk(5)] - paddd xmm0,xmm3 - paddd xmm4,xmm3 - psrld xmm0,SCALEBITS ; xmm0=YOL - psrld xmm4,SCALEBITS ; xmm4=YOH - packssdw xmm0,xmm4 ; xmm0=YO - - pxor xmm3,xmm3 - pxor xmm4,xmm4 - punpcklwd xmm3,xmm1 ; xmm3=ROL - punpckhwd xmm4,xmm1 ; xmm4=ROH - psrld xmm3,1 ; xmm3=ROL*FIX(0.500) - psrld xmm4,1 ; xmm4=ROH*FIX(0.500) - - movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] - - paddd xmm7,xmm3 - paddd xmm5,xmm4 - paddd xmm7,xmm1 - paddd xmm5,xmm1 - psrld xmm7,SCALEBITS ; xmm7=CrOL - psrld xmm5,SCALEBITS ; xmm5=CrOH - packssdw xmm7,xmm5 ; xmm7=CrO - - movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE - - movdqa xmm4,xmm6 - punpcklwd xmm6,xmm2 - punpckhwd xmm4,xmm2 - movdqa xmm1,xmm6 - movdqa xmm5,xmm4 - pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) - pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) - pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) - - movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] - - paddd xmm6, XMMWORD [wk(6)] - paddd xmm4, XMMWORD [wk(7)] - paddd xmm6,xmm2 - paddd xmm4,xmm2 - psrld xmm6,SCALEBITS ; xmm6=YEL - psrld xmm4,SCALEBITS ; xmm4=YEH - packssdw xmm6,xmm4 ; xmm6=YE - - psllw xmm0,BYTE_BIT - por xmm6,xmm0 ; xmm6=Y - movdqa XMMWORD [rdi], xmm6 ; Save Y - - pxor xmm2,xmm2 - pxor xmm4,xmm4 - punpcklwd xmm2,xmm3 ; xmm2=REL - punpckhwd xmm4,xmm3 ; xmm4=REH - 
psrld xmm2,1 ; xmm2=REL*FIX(0.500) - psrld xmm4,1 ; xmm4=REH*FIX(0.500) - - movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ] - - paddd xmm1,xmm2 - paddd xmm5,xmm4 - paddd xmm1,xmm0 - paddd xmm5,xmm0 - psrld xmm1,SCALEBITS ; xmm1=CrEL - psrld xmm5,SCALEBITS ; xmm5=CrEH - packssdw xmm1,xmm5 ; xmm1=CrE - - psllw xmm7,BYTE_BIT - por xmm1,xmm7 ; xmm1=Cr - movdqa XMMWORD [rdx], xmm1 ; Save Cr - - sub rcx, byte SIZEOF_XMMWORD - add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr - add rdi, byte SIZEOF_XMMWORD ; outptr0 - add rbx, byte SIZEOF_XMMWORD ; outptr1 - add rdx, byte SIZEOF_XMMWORD ; outptr2 - cmp rcx, byte SIZEOF_XMMWORD - jae near .columnloop - test rcx,rcx - jnz near .column_ld1 - - pop rcx ; col - pop rsi - pop rdi - pop rbx - pop rdx - - add rsi, byte SIZEOF_JSAMPROW ; input_buf - add rdi, byte SIZEOF_JSAMPROW - add rbx, byte SIZEOF_JSAMPROW - add rdx, byte SIZEOF_JSAMPROW - dec rax ; num_rows - jg near .rowloop + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 
1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + movdqa xmm7, xmm1 + movdqa xmm4, xmm6 + pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4, [rel PW_MF016_MF033] ; 
xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1, xmm1 + pxor xmm6, xmm6 + punpcklwd xmm1, xmm5 ; xmm1=BOL + punpckhwd xmm6, xmm5 ; xmm6=BOH + psrld xmm1, 1 ; xmm1=BOL*FIX(0.500) + psrld xmm6, 1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7, xmm1 + paddd xmm4, xmm6 + paddd xmm7, xmm5 + paddd xmm4, xmm5 + psrld xmm7, SCALEBITS ; xmm7=CbOL + psrld xmm4, SCALEBITS ; xmm4=CbOH + packssdw xmm7, xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + movdqa xmm5, xmm0 + movdqa xmm4, xmm6 + pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0, xmm0 + pxor xmm6, xmm6 + punpcklwd xmm0, xmm1 ; xmm0=BEL + punpckhwd xmm6, xmm1 ; xmm6=BEH + psrld xmm0, 1 ; xmm0=BEL*FIX(0.500) + psrld xmm6, 1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5, xmm0 + paddd xmm4, xmm6 + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrld xmm5, SCALEBITS ; xmm5=CbEL + psrld xmm4, SCALEBITS ; xmm4=CbEH + packssdw xmm5, xmm4 ; xmm5=CbE + + psllw xmm7, BYTE_BIT + por xmm5, xmm7 ; xmm5=Cb + movdqa XMMWORD [rbx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + movdqa xmm7, xmm0 + movdqa xmm5, xmm4 + pmaddwd xmm0, [rel PW_F0114_F0250] ; 
xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + pxor xmm3, xmm3 + pxor xmm4, xmm4 + punpcklwd xmm3, xmm1 ; xmm3=ROL + punpckhwd xmm4, xmm1 ; xmm4=ROH + psrld xmm3, 1 ; xmm3=ROL*FIX(0.500) + psrld xmm4, 1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7, xmm3 + paddd xmm5, xmm4 + paddd xmm7, xmm1 + paddd xmm5, xmm1 + psrld xmm7, SCALEBITS ; xmm7=CrOL + psrld xmm5, SCALEBITS ; xmm5=CrOH + packssdw xmm7, xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + pxor xmm2, xmm2 + pxor xmm4, xmm4 + punpcklwd xmm2, xmm3 ; xmm2=REL + punpckhwd xmm4, xmm3 ; xmm4=REH + psrld xmm2, 1 ; xmm2=REL*FIX(0.500) + psrld xmm4, 1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd 
xmm1, xmm2 + paddd xmm5, xmm4 + paddd xmm1, xmm0 + paddd xmm5, xmm0 + psrld xmm1, SCALEBITS ; xmm1=CrEL + psrld xmm5, SCALEBITS ; xmm5=CrEH + packssdw xmm1, xmm5 ; xmm1=CrE + + psllw xmm7, BYTE_BIT + por xmm1, xmm7 ; xmm1=Cr + movdqa XMMWORD [rdx], xmm1 ; Save Cr + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + add rbx, byte SIZEOF_XMMWORD ; outptr1 + add rdx, byte SIZEOF_XMMWORD ; outptr2 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx, rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + pop rbx + pop rdx + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop .return: - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + pop rbx + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jccolext-sse2.asm b/simd/jccolext-sse2.asm index cc38e98..d8496dc 100644 --- a/simd/jccolext-sse2.asm +++ b/simd/jccolext-sse2.asm @@ -25,479 +25,479 @@ ; JDIMENSION output_row, int num_rows); ; -%define img_width(b) (b)+8 ; JDIMENSION img_width -%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf -%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf -%define output_row(b) (b)+20 ; JDIMENSION output_row -%define num_rows(b) (b)+24 ; int num_rows +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 8 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 16 - global EXTN(jsimd_rgb_ycc_convert_sse2) + global EXTN(jsimd_rgb_ycc_convert_sse2) EXTN(jsimd_rgb_ycc_convert_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [img_width(eax)] - test ecx,ecx - jz near .return - - push ecx - - mov esi, JSAMPIMAGE [output_buf(eax)] - mov ecx, JDIMENSION [output_row(eax)] - mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] - lea edi, [edi+ecx*SIZEOF_JSAMPROW] - lea ebx, 
[ebx+ecx*SIZEOF_JSAMPROW] - lea edx, [edx+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov esi, JSAMPARRAY [input_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 .rowloop: - pushpic eax - push edx - push ebx - push edi - push esi - push ecx ; col + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr0 - mov ebx, JSAMPROW [ebx] ; outptr1 - mov edx, JSAMPROW [edx] ; outptr2 - movpic eax, POINTER [gotptr] ; load GOT address (eax) + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - alignx 16,7 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16, 7 -%if RGB_PIXELSIZE == 3 ; --------------- +%if RGB_PIXELSIZE == 3 ; --------------- .column_ld1: - push eax - push edx - lea 
ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub ecx, byte SIZEOF_BYTE - movzx eax, BYTE [esi+ecx] + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] .column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub ecx, byte SIZEOF_WORD - movzx edx, WORD [esi+ecx] - shl eax, WORD_BIT - or eax,edx + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax, edx .column_ld4: - movd xmmA,eax - pop edx - pop eax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub ecx, byte SIZEOF_DWORD - movd xmmF, XMM_DWORD [esi+ecx] - pslldq xmmA, SIZEOF_DWORD - por xmmA,xmmF + movd xmmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF .column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - sub ecx, byte SIZEOF_MMWORD - movq xmmB, XMM_MMWORD [esi+ecx] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmB + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB .column_ld16: - test cl, SIZEOF_XMMWORD - jz short .column_ld32 - movdqa xmmF,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - mov ecx, SIZEOF_XMMWORD - jmp short .rgb_ycc_cnv + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv .column_ld32: - test cl, 2*SIZEOF_XMMWORD - mov ecx, SIZEOF_XMMWORD - jz short .rgb_ycc_cnv - movdqa xmmB,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_ycc_cnv - alignx 16,7 + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa 
xmmB, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 .columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] .rgb_ycc_cnv: - ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - movdqa xmmG,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) - psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) - pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) - punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) - punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) - movdqa xmmD,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) - psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 
18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) - pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) - punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) - punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) - movdqa xmmE,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) - psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) - punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) - pxor xmmH,xmmH + pxor xmmH, xmmH - movdqa xmmC,xmmA - punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmB,xmmE - punpcklbw 
xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) - movdqa xmmF,xmmD - punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) - punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) -%else ; RGB_PIXELSIZE == 4 ; ----------- +%else ; RGB_PIXELSIZE == 4 ; ----------- .column_ld1: - test cl, SIZEOF_XMMWORD/16 - jz short .column_ld2 - sub ecx, byte SIZEOF_XMMWORD/16 - movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] .column_ld2: - test cl, SIZEOF_XMMWORD/8 - jz short .column_ld4 - sub ecx, byte SIZEOF_XMMWORD/8 - movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmE + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmE .column_ld4: - test cl, SIZEOF_XMMWORD/4 - jz short .column_ld8 - sub ecx, byte SIZEOF_XMMWORD/4 - movdqa xmmE,xmmA - movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] .column_ld8: - test cl, SIZEOF_XMMWORD/2 - mov ecx, SIZEOF_XMMWORD - jz short .rgb_ycc_cnv - movdqa xmmF,xmmA - movdqa xmmH,xmmE - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_ycc_cnv - alignx 16,7 + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD 
[esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16, 7 .columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] - movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] .rgb_ycc_cnv: - ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) - punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) - - movdqa xmmC,xmmF - punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) - punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) - - movdqa xmmB,xmmA - punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) - punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) - - movdqa xmmG,xmmD - punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) - punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) - - movdqa xmmE,xmmA - punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) - - movdqa xmmH,xmmB - punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) - punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) - - pxor xmmF,xmmF - - movdqa xmmC,xmmA - punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) - - movdqa xmmD,xmmB - punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) - 
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) - - movdqa xmmG,xmmE - punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) - - punpcklbw xmmF,xmmH - punpckhbw xmmH,xmmH - psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) - psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) - -%endif ; RGB_PIXELSIZE ; --------------- - - ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE - ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO - movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE - movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO - - movdqa xmm6,xmm1 - punpcklwd xmm1,xmm3 - punpckhwd xmm6,xmm3 - movdqa xmm7,xmm1 - movdqa xmm4,xmm6 - pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) - pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) - pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) - - movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) - movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) - - pxor xmm1,xmm1 - pxor xmm6,xmm6 - punpcklwd xmm1,xmm5 ; xmm1=BOL - punpckhwd xmm6,xmm5 ; xmm6=BOH - psrld xmm1,1 ; xmm1=BOL*FIX(0.500) - psrld xmm6,1 ; xmm6=BOH*FIX(0.500) - - movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] - - paddd xmm7,xmm1 - paddd xmm4,xmm6 - paddd xmm7,xmm5 - paddd xmm4,xmm5 
- psrld xmm7,SCALEBITS ; xmm7=CbOL - psrld xmm4,SCALEBITS ; xmm4=CbOH - packssdw xmm7,xmm4 ; xmm7=CbO - - movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE - - movdqa xmm6,xmm0 - punpcklwd xmm0,xmm2 - punpckhwd xmm6,xmm2 - movdqa xmm5,xmm0 - movdqa xmm4,xmm6 - pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) - pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) - pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) - - movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) - movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) - - pxor xmm0,xmm0 - pxor xmm6,xmm6 - punpcklwd xmm0,xmm1 ; xmm0=BEL - punpckhwd xmm6,xmm1 ; xmm6=BEH - psrld xmm0,1 ; xmm0=BEL*FIX(0.500) - psrld xmm6,1 ; xmm6=BEH*FIX(0.500) - - movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] - - paddd xmm5,xmm0 - paddd xmm4,xmm6 - paddd xmm5,xmm1 - paddd xmm4,xmm1 - psrld xmm5,SCALEBITS ; xmm5=CbEL - psrld xmm4,SCALEBITS ; xmm4=CbEH - packssdw xmm5,xmm4 ; xmm5=CbE - - psllw xmm7,BYTE_BIT - por xmm5,xmm7 ; xmm5=Cb - movdqa XMMWORD [ebx], xmm5 ; Save Cb - - movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO - movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO - - movdqa xmm4,xmm0 - punpcklwd xmm0,xmm3 - punpckhwd xmm4,xmm3 - movdqa xmm7,xmm0 - movdqa xmm5,xmm4 - pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) - pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) - pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) - - movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] - - paddd xmm0, XMMWORD [wk(4)] - paddd xmm4, XMMWORD [wk(5)] - paddd xmm0,xmm3 - paddd xmm4,xmm3 - psrld xmm0,SCALEBITS ; xmm0=YOL - psrld xmm4,SCALEBITS 
; xmm4=YOH - packssdw xmm0,xmm4 ; xmm0=YO - - pxor xmm3,xmm3 - pxor xmm4,xmm4 - punpcklwd xmm3,xmm1 ; xmm3=ROL - punpckhwd xmm4,xmm1 ; xmm4=ROH - psrld xmm3,1 ; xmm3=ROL*FIX(0.500) - psrld xmm4,1 ; xmm4=ROH*FIX(0.500) - - movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] - - paddd xmm7,xmm3 - paddd xmm5,xmm4 - paddd xmm7,xmm1 - paddd xmm5,xmm1 - psrld xmm7,SCALEBITS ; xmm7=CrOL - psrld xmm5,SCALEBITS ; xmm5=CrOH - packssdw xmm7,xmm5 ; xmm7=CrO - - movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE - - movdqa xmm4,xmm6 - punpcklwd xmm6,xmm2 - punpckhwd xmm4,xmm2 - movdqa xmm1,xmm6 - movdqa xmm5,xmm4 - pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) - pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) - pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) - - movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] - - paddd xmm6, XMMWORD [wk(6)] - paddd xmm4, XMMWORD [wk(7)] - paddd xmm6,xmm2 - paddd xmm4,xmm2 - psrld xmm6,SCALEBITS ; xmm6=YEL - psrld xmm4,SCALEBITS ; xmm4=YEH - packssdw xmm6,xmm4 ; xmm6=YE - - psllw xmm0,BYTE_BIT - por xmm6,xmm0 ; xmm6=Y - movdqa XMMWORD [edi], xmm6 ; Save Y - - pxor xmm2,xmm2 - pxor xmm4,xmm4 - punpcklwd xmm2,xmm3 ; xmm2=REL - punpckhwd xmm4,xmm3 ; xmm4=REH - psrld xmm2,1 ; xmm2=REL*FIX(0.500) - psrld xmm4,1 ; xmm4=REH*FIX(0.500) - - movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] - - paddd xmm1,xmm2 - paddd xmm5,xmm4 - paddd xmm1,xmm0 - paddd xmm5,xmm0 - psrld xmm1,SCALEBITS ; xmm1=CrEL - psrld xmm5,SCALEBITS ; xmm5=CrEH - packssdw xmm1,xmm5 ; xmm1=CrE - - psllw xmm7,BYTE_BIT - por xmm1,xmm7 ; xmm1=Cr - movdqa XMMWORD [edx], xmm1 ; Save Cr - - sub ecx, byte SIZEOF_XMMWORD - add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr - add edi, byte SIZEOF_XMMWORD ; outptr0 - add ebx, byte SIZEOF_XMMWORD ; outptr1 - add edx, byte SIZEOF_XMMWORD ; 
outptr2 - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - test ecx,ecx - jnz near .column_ld1 - - pop ecx ; col - pop esi - pop edi - pop ebx - pop edx - poppic eax - - add esi, byte SIZEOF_JSAMPROW ; input_buf - add edi, byte SIZEOF_JSAMPROW - add ebx, byte SIZEOF_JSAMPROW - add edx, byte SIZEOF_JSAMPROW - dec eax ; num_rows - jg near .rowloop + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + 
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + movdqa xmm7, xmm1 + movdqa xmm4, xmm6 + pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1, xmm1 + pxor xmm6, xmm6 + punpcklwd xmm1, xmm5 ; xmm1=BOL + punpckhwd xmm6, xmm5 ; xmm6=BOH + psrld xmm1, 1 ; xmm1=BOL*FIX(0.500) + psrld xmm6, 1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7, xmm1 + paddd xmm4, xmm6 + paddd xmm7, xmm5 + paddd xmm4, xmm5 + psrld xmm7, SCALEBITS ; xmm7=CbOL + psrld xmm4, 
SCALEBITS ; xmm4=CbOH + packssdw xmm7, xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + movdqa xmm5, xmm0 + movdqa xmm4, xmm6 + pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0, xmm0 + pxor xmm6, xmm6 + punpcklwd xmm0, xmm1 ; xmm0=BEL + punpckhwd xmm6, xmm1 ; xmm6=BEH + psrld xmm0, 1 ; xmm0=BEL*FIX(0.500) + psrld xmm6, 1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5, xmm0 + paddd xmm4, xmm6 + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrld xmm5, SCALEBITS ; xmm5=CbEL + psrld xmm4, SCALEBITS ; xmm4=CbEH + packssdw xmm5, xmm4 ; xmm5=CbE + + psllw xmm7, BYTE_BIT + por xmm5, xmm7 ; xmm5=Cb + movdqa XMMWORD [ebx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + movdqa xmm7, xmm0 + movdqa xmm5, xmm4 + pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; 
xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + pxor xmm3, xmm3 + pxor xmm4, xmm4 + punpcklwd xmm3, xmm1 ; xmm3=ROL + punpckhwd xmm4, xmm1 ; xmm4=ROH + psrld xmm3, 1 ; xmm3=ROL*FIX(0.500) + psrld xmm4, 1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7, xmm3 + paddd xmm5, xmm4 + paddd xmm7, xmm1 + paddd xmm5, xmm1 + psrld xmm7, SCALEBITS ; xmm7=CrOL + psrld xmm5, SCALEBITS ; xmm5=CrOH + packssdw xmm7, xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + pxor xmm2, xmm2 + pxor xmm4, xmm4 + punpcklwd xmm2, xmm3 ; xmm2=REL + punpckhwd xmm4, xmm3 ; xmm4=REH + psrld xmm2, 1 ; xmm2=REL*FIX(0.500) + psrld xmm4, 1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1, xmm2 + paddd xmm5, xmm4 + paddd xmm1, xmm0 + paddd xmm5, xmm0 + psrld xmm1, SCALEBITS ; xmm1=CrEL + psrld xmm5, SCALEBITS ; xmm5=CrEH + packssdw xmm1, xmm5 ; xmm1=CrE + + psllw xmm7, BYTE_BIT + por xmm1, xmm7 ; xmm1=Cr + movdqa XMMWORD [edx], xmm1 ; Save Cr + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + add ebx, byte SIZEOF_XMMWORD 
; outptr1 + add edx, byte SIZEOF_XMMWORD ; outptr2 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jccolor-sse2-64.asm b/simd/jccolor-sse2-64.asm index bd2188b..af6e1e2 100644 --- a/simd/jccolor-sse2-64.asm +++ b/simd/jccolor-sse2-64.asm @@ -19,23 +19,23 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 - -F_0_081 equ 5329 ; FIX(0.08131) -F_0_114 equ 7471 ; FIX(0.11400) -F_0_168 equ 11059 ; FIX(0.16874) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_331 equ 21709 ; FIX(0.33126) -F_0_418 equ 27439 ; FIX(0.41869) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - 
global EXTN(jconst_rgb_ycc_convert_sse2) + alignz 16 + global EXTN(jconst_rgb_ycc_convert_sse2) EXTN(jconst_rgb_ycc_convert_sse2): @@ -46,11 +46,11 @@ PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 %include "jccolext-sse2-64.asm" diff --git a/simd/jccolor-sse2.asm b/simd/jccolor-sse2.asm index 13124d1..aae51ba 100644 --- a/simd/jccolor-sse2.asm +++ b/simd/jccolor-sse2.asm @@ -19,23 +19,23 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 - -F_0_081 equ 5329 ; FIX(0.08131) -F_0_114 equ 7471 ; FIX(0.11400) -F_0_168 equ 11059 ; FIX(0.16874) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_331 equ 21709 ; FIX(0.33126) -F_0_418 equ 27439 ; FIX(0.41869) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_ycc_convert_sse2) + alignz 16 + global EXTN(jconst_rgb_ycc_convert_sse2) EXTN(jconst_rgb_ycc_convert_sse2): @@ -46,11 +46,11 @@ PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; 
-------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jccolext-sse2.asm" diff --git a/simd/jcgray-sse2-64.asm b/simd/jcgray-sse2-64.asm index bafd302..61c9682 100644 --- a/simd/jcgray-sse2-64.asm +++ b/simd/jcgray-sse2-64.asm @@ -19,31 +19,31 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_114 equ 7471 ; FIX(0.11400) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_gray_convert_sse2) + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) EXTN(jconst_rgb_gray_convert_sse2): -PW_F0299_F0337 times 4 dw F_0_299, F_0_337 -PW_F0114_F0250 times 4 dw F_0_114, F_0_250 -PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 %include "jcgryext-sse2-64.asm" diff --git a/simd/jcgray-sse2.asm b/simd/jcgray-sse2.asm index 5b0b466..831fda6 100644 --- a/simd/jcgray-sse2.asm +++ b/simd/jcgray-sse2.asm @@ -19,31 +19,31 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_114 equ 7471 ; FIX(0.11400) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 
equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_gray_convert_sse2) + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) EXTN(jconst_rgb_gray_convert_sse2): -PW_F0299_F0337 times 4 dw F_0_299, F_0_337 -PW_F0114_F0250 times 4 dw F_0_114, F_0_250 -PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jcgryext-sse2.asm" diff --git a/simd/jcgryext-sse2-64.asm b/simd/jcgryext-sse2-64.asm index 541355a..504e295 100644 --- a/simd/jcgryext-sse2-64.asm +++ b/simd/jcgryext-sse2-64.asm @@ -33,333 +33,333 @@ ; r13 = JDIMENSION output_row ; r14 = int num_rows -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 + align 16 - global EXTN(jsimd_rgb_gray_convert_sse2) + global EXTN(jsimd_rgb_gray_convert_sse2) EXTN(jsimd_rgb_gray_convert_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - push rbx - - mov ecx, r10d - test rcx,rcx - jz near .return - - push rcx - - mov rsi, r12 - mov ecx, r13d - mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] - lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] - - pop rcx - - mov rsi, r11 - mov eax, r14d - test rax,rax - jle near .return + push 
rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d + test rcx, rcx + jz near .return + + push rcx + + mov rsi, r12 + mov ecx, r13d + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax, rax + jle near .return .rowloop: - push rdi - push rsi - push rcx ; col + push rdi + push rsi + push rcx ; col - mov rsi, JSAMPROW [rsi] ; inptr - mov rdi, JSAMPROW [rdi] ; outptr0 + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 - cmp rcx, byte SIZEOF_XMMWORD - jae near .columnloop + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop -%if RGB_PIXELSIZE == 3 ; --------------- +%if RGB_PIXELSIZE == 3 ; --------------- .column_ld1: - push rax - push rdx - lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub rcx, byte SIZEOF_BYTE - movzx rax, BYTE [rsi+rcx] + push rax + push rdx + lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, BYTE [rsi+rcx] .column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub rcx, byte SIZEOF_WORD - movzx rdx, WORD [rsi+rcx] - shl rax, WORD_BIT - or rax,rdx + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, WORD [rsi+rcx] + shl rax, WORD_BIT + or rax, rdx .column_ld4: - movd xmmA,eax - pop rdx - pop rax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub rcx, byte SIZEOF_DWORD - movd xmmF, XMM_DWORD [rsi+rcx] - pslldq xmmA, SIZEOF_DWORD - por xmmA,xmmF + movd xmmA, eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF .column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - sub rcx, 
byte SIZEOF_MMWORD - movq xmmB, XMM_MMWORD [rsi+rcx] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmB + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB .column_ld16: - test cl, SIZEOF_XMMWORD - jz short .column_ld32 - movdqa xmmF,xmmA - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - mov rcx, SIZEOF_XMMWORD - jmp short .rgb_gray_cnv + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv .column_ld32: - test cl, 2*SIZEOF_XMMWORD - mov rcx, SIZEOF_XMMWORD - jz short .rgb_gray_cnv - movdqa xmmB,xmmA - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] - jmp short .rgb_gray_cnv + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB, xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv .columnloop: - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] .rgb_gray_cnv: - ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - movdqa xmmG,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) - psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; xmmG=(22 
03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) - pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) - punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) - punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) - movdqa xmmD,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) - psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) - pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) - punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) - punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) - movdqa xmmE,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) - psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmD ; xmmA=(00 02 04 
06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) - punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) - pxor xmmH,xmmH + pxor xmmH, xmmH - movdqa xmmC,xmmA - punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmB,xmmE - punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) - movdqa xmmF,xmmD - punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) - punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) -%else ; RGB_PIXELSIZE == 4 ; ----------- +%else ; RGB_PIXELSIZE == 4 ; ----------- .column_ld1: - test cl, SIZEOF_XMMWORD/16 - jz short .column_ld2 - sub rcx, byte SIZEOF_XMMWORD/16 - movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] .column_ld2: - test cl, SIZEOF_XMMWORD/8 - jz short .column_ld4 - sub rcx, byte SIZEOF_XMMWORD/8 - movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmE + test cl, SIZEOF_XMMWORD/8 + jz short 
.column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmE .column_ld4: - test cl, SIZEOF_XMMWORD/4 - jz short .column_ld8 - sub rcx, byte SIZEOF_XMMWORD/4 - movdqa xmmE,xmmA - movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] .column_ld8: - test cl, SIZEOF_XMMWORD/2 - mov rcx, SIZEOF_XMMWORD - jz short .rgb_gray_cnv - movdqa xmmF,xmmA - movdqa xmmH,xmmE - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] - jmp short .rgb_gray_cnv + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv .columnloop: - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] - movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] .rgb_gray_cnv: - ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) - punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) - - movdqa xmmC,xmmF - punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) - punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) - - movdqa xmmB,xmmA - punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 
1C 20 24 28 2C 30 34 38 3C) - punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) - - movdqa xmmG,xmmD - punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) - punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) - - movdqa xmmE,xmmA - punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) - - movdqa xmmH,xmmB - punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) - punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) - - pxor xmmF,xmmF - - movdqa xmmC,xmmA - punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) - - movdqa xmmD,xmmB - punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) - - movdqa xmmG,xmmE - punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) - - punpcklbw xmmF,xmmH - punpckhbw xmmH,xmmH - psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) - psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) - -%endif ; RGB_PIXELSIZE ; --------------- - - ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE - ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - - movdqa xmm6,xmm1 - punpcklwd xmm1,xmm3 - punpckhwd xmm6,xmm3 - pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) - - movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) - - movdqa xmm6,xmm0 - punpcklwd xmm0,xmm2 - punpckhwd xmm6,xmm2 - pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd xmm6,[rel PW_F0299_F0337] ; 
xmm6=REH*FIX(0.299)+GEH*FIX(0.337) - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) - - movdqa xmm0, xmm5 ; xmm0=BO - movdqa xmm6, xmm4 ; xmm6=BE - - movdqa xmm4,xmm0 - punpcklwd xmm0,xmm3 - punpckhwd xmm4,xmm3 - pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) - - movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] - - paddd xmm0, xmm1 - paddd xmm4, xmm7 - paddd xmm0,xmm3 - paddd xmm4,xmm3 - psrld xmm0,SCALEBITS ; xmm0=YOL - psrld xmm4,SCALEBITS ; xmm4=YOH - packssdw xmm0,xmm4 ; xmm0=YO - - movdqa xmm4,xmm6 - punpcklwd xmm6,xmm2 - punpckhwd xmm4,xmm2 - pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) - - movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] - - paddd xmm6, XMMWORD [wk(0)] - paddd xmm4, XMMWORD [wk(1)] - paddd xmm6,xmm2 - paddd xmm4,xmm2 - psrld xmm6,SCALEBITS ; xmm6=YEL - psrld xmm4,SCALEBITS ; xmm4=YEH - packssdw xmm6,xmm4 ; xmm6=YE - - psllw xmm0,BYTE_BIT - por xmm6,xmm0 ; xmm6=Y - movdqa XMMWORD [rdi], xmm6 ; Save Y - - sub rcx, byte SIZEOF_XMMWORD - add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr - add rdi, byte SIZEOF_XMMWORD ; outptr0 - cmp rcx, byte SIZEOF_XMMWORD - jae near .columnloop - test rcx,rcx - jnz near .column_ld1 - - pop rcx ; col - pop rsi - pop rdi - - add rsi, byte SIZEOF_JSAMPROW ; input_buf - add rdi, byte SIZEOF_JSAMPROW - dec rax ; num_rows - jg near .rowloop + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 
13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd 
xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx, rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop .return: - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp 
- pop rsp ; rsp <- original rbp - pop rbp - ret + pop rbx + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jcgryext-sse2.asm b/simd/jcgryext-sse2.asm index cd16dd1..78beac7 100644 --- a/simd/jcgryext-sse2.asm +++ b/simd/jcgryext-sse2.asm @@ -27,358 +27,358 @@ ; JDIMENSION output_row, int num_rows); ; -%define img_width(b) (b)+8 ; JDIMENSION img_width -%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf -%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf -%define output_row(b) (b)+20 ; JDIMENSION output_row -%define num_rows(b) (b)+24 ; int num_rows +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 16 - global EXTN(jsimd_rgb_gray_convert_sse2) + global EXTN(jsimd_rgb_gray_convert_sse2) EXTN(jsimd_rgb_gray_convert_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [img_width(eax)] - test ecx,ecx - jz near .return - - push ecx - - mov 
esi, JSAMPIMAGE [output_buf(eax)] - mov ecx, JDIMENSION [output_row(eax)] - mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] - lea edi, [edi+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov esi, JSAMPARRAY [input_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx, ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 .rowloop: - pushpic eax - push edi - push esi - push ecx ; col + pushpic eax + push edi + push esi + push ecx ; col - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr0 - movpic eax, POINTER [gotptr] ; load GOT address (eax) + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - alignx 16,7 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16, 7 -%if RGB_PIXELSIZE == 3 ; --------------- +%if RGB_PIXELSIZE == 3 ; --------------- .column_ld1: - push eax - push edx - lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub ecx, byte SIZEOF_BYTE - movzx eax, BYTE [esi+ecx] + push eax + push edx + lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub 
ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] .column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub ecx, byte SIZEOF_WORD - movzx edx, WORD [esi+ecx] - shl eax, WORD_BIT - or eax,edx + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax, edx .column_ld4: - movd xmmA,eax - pop edx - pop eax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub ecx, byte SIZEOF_DWORD - movd xmmF, XMM_DWORD [esi+ecx] - pslldq xmmA, SIZEOF_DWORD - por xmmA,xmmF + movd xmmA, eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA, xmmF .column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - sub ecx, byte SIZEOF_MMWORD - movq xmmB, XMM_MMWORD [esi+ecx] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmB + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmB .column_ld16: - test cl, SIZEOF_XMMWORD - jz short .column_ld32 - movdqa xmmF,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - mov ecx, SIZEOF_XMMWORD - jmp short .rgb_gray_cnv + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv .column_ld32: - test cl, 2*SIZEOF_XMMWORD - mov ecx, SIZEOF_XMMWORD - jz short .rgb_gray_cnv - movdqa xmmB,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_gray_cnv - alignx 16,7 + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB, xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 .columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu 
xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] .rgb_gray_cnv: - ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - movdqa xmmG,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) - psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + movdqa xmmG, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) - pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) - punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) - punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) - movdqa xmmD,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) - psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + movdqa xmmD, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) - pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + punpckhbw 
xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) - punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) - punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) - movdqa xmmE,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) - psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + movdqa xmmE, xmmA + pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) - punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) - pxor xmmH,xmmH + pxor xmmH, xmmH - movdqa xmmC,xmmA - punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC, xmmA + punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmB,xmmE - punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + movdqa xmmB, xmmE + punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) - movdqa xmmF,xmmD - 
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) - punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + movdqa xmmF, xmmD + punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) -%else ; RGB_PIXELSIZE == 4 ; ----------- +%else ; RGB_PIXELSIZE == 4 ; ----------- .column_ld1: - test cl, SIZEOF_XMMWORD/16 - jz short .column_ld2 - sub ecx, byte SIZEOF_XMMWORD/16 - movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] .column_ld2: - test cl, SIZEOF_XMMWORD/8 - jz short .column_ld4 - sub ecx, byte SIZEOF_XMMWORD/8 - movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmE + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA, xmmE .column_ld4: - test cl, SIZEOF_XMMWORD/4 - jz short .column_ld8 - sub ecx, byte SIZEOF_XMMWORD/4 - movdqa xmmE,xmmA - movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE, xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] .column_ld8: - test cl, SIZEOF_XMMWORD/2 - mov ecx, SIZEOF_XMMWORD - jz short .rgb_gray_cnv - movdqa xmmF,xmmA - movdqa xmmH,xmmE - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_gray_cnv - alignx 16,7 + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF, xmmA + movdqa xmmH, xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16, 7 .columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] - movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + 
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] .rgb_gray_cnv: - ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) - punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) - - movdqa xmmC,xmmF - punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) - punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) - - movdqa xmmB,xmmA - punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) - punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) - - movdqa xmmG,xmmD - punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) - punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) - - movdqa xmmE,xmmA - punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) - - movdqa xmmH,xmmB - punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) - punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) - - pxor xmmF,xmmF - - movdqa xmmC,xmmA - punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) - - movdqa xmmD,xmmB - punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) - - movdqa xmmG,xmmE - punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) - - punpcklbw xmmF,xmmH - punpckhbw xmmH,xmmH - psrlw xmmF,BYTE_BIT ; xmmF=(21 
23 25 27 29 2B 2D 2F) - psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) - -%endif ; RGB_PIXELSIZE ; --------------- - - ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE - ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - - movdqa xmm6,xmm1 - punpcklwd xmm1,xmm3 - punpckhwd xmm6,xmm3 - pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) - - movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) - - movdqa xmm6,xmm0 - punpcklwd xmm0,xmm2 - punpckhwd xmm6,xmm2 - pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) - - movdqa xmm0, xmm5 ; xmm0=BO - movdqa xmm6, xmm4 ; xmm6=BE - - movdqa xmm4,xmm0 - punpcklwd xmm0,xmm3 - punpckhwd xmm4,xmm3 - pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) - - movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] - - paddd xmm0, xmm1 - paddd xmm4, xmm7 - paddd xmm0,xmm3 - paddd xmm4,xmm3 - psrld xmm0,SCALEBITS ; xmm0=YOL - psrld xmm4,SCALEBITS ; xmm4=YOH - packssdw xmm0,xmm4 ; xmm0=YO - - movdqa xmm4,xmm6 - punpcklwd xmm6,xmm2 - punpckhwd xmm4,xmm2 - pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) - - movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] - - paddd xmm6, XMMWORD [wk(0)] - paddd xmm4, XMMWORD [wk(1)] - paddd xmm6,xmm2 - paddd xmm4,xmm2 - psrld xmm6,SCALEBITS 
; xmm6=YEL - psrld xmm4,SCALEBITS ; xmm4=YEH - packssdw xmm6,xmm4 ; xmm6=YE - - psllw xmm0,BYTE_BIT - por xmm6,xmm0 ; xmm6=Y - movdqa XMMWORD [edi], xmm6 ; Save Y - - sub ecx, byte SIZEOF_XMMWORD - add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr - add edi, byte SIZEOF_XMMWORD ; outptr0 - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - test ecx,ecx - jnz near .column_ld1 - - pop ecx ; col - pop esi - pop edi - poppic eax - - add esi, byte SIZEOF_JSAMPROW ; input_buf - add edi, byte SIZEOF_JSAMPROW - dec eax ; num_rows - jg near .rowloop + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC, xmmF + punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB, xmmA + punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG, xmmD + punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE, xmmA + punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH, xmmB + punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF, xmmF + + movdqa xmmC, xmmA + punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC, xmmF 
; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD, xmmB + punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG, xmmE + punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF, xmmH + punpckhbw xmmH, xmmH + psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + +%endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm6, xmm3 + pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm6, xmm2 + pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 + punpckhwd xmm4, xmm3 + pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0, xmm3 + paddd xmm4, xmm3 + psrld xmm0, SCALEBITS ; xmm0=YOL + psrld xmm4, SCALEBITS ; xmm4=YOH + packssdw xmm0, xmm4 ; xmm0=YO + + 
movdqa xmm4, xmm6 + punpcklwd xmm6, xmm2 + punpckhwd xmm4, xmm2 + pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6, xmm2 + paddd xmm4, xmm2 + psrld xmm6, SCALEBITS ; xmm6=YEL + psrld xmm4, SCALEBITS ; xmm4=YEH + packssdw xmm6, xmm4 ; xmm6=YE + + psllw xmm0, BYTE_BIT + por xmm6, xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx, ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jchuff-sse2-64.asm b/simd/jchuff-sse2-64.asm index b1144d1..701cbc2 100644 --- a/simd/jchuff-sse2-64.asm +++ b/simd/jchuff-sse2-64.asm @@ -23,20 +23,20 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_huff_encode_one_block) + alignz 16 + global EXTN(jconst_huff_encode_one_block) EXTN(jconst_huff_encode_one_block): %include "jpeg_nbits_table.inc" - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; These macros perform the same task as the emit_bits() function in the ; original libjpeg code. In addition to reducing overhead by explicitly @@ -46,118 +46,118 @@ EXTN(jconst_huff_encode_one_block): ; bytes can be stored in a 64-bit bit buffer before it has to be emptied. %macro EMIT_BYTE 0 - sub put_bits, 8 ; put_bits -= 8; - mov rdx, put_buffer - mov ecx, put_bits - shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); - mov byte [buffer], dl ; *buffer++ = c; - add buffer, 1 - cmp dl, 0xFF ; need to stuff a zero byte? - jne %%.EMIT_BYTE_END - mov byte [buffer], 0 ; *buffer++ = 0; - add buffer, 1 + sub put_bits, 8 ; put_bits -= 8; + mov rdx, put_buffer + mov ecx, put_bits + shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); + mov byte [buffer], dl ; *buffer++ = c; + add buffer, 1 + cmp dl, 0xFF ; need to stuff a zero byte? 
+ jne %%.EMIT_BYTE_END + mov byte [buffer], 0 ; *buffer++ = 0; + add buffer, 1 %%.EMIT_BYTE_END: %endmacro %macro PUT_BITS 1 - add put_bits, ecx ; put_bits += size; - shl put_buffer, cl ; put_buffer = (put_buffer << size); - or put_buffer, %1 + add put_bits, ecx ; put_bits += size; + shl put_buffer, cl ; put_buffer = (put_buffer << size); + or put_buffer, %1 %endmacro %macro CHECKBUF31 0 - cmp put_bits, 32 ; if (put_bits > 31) { - jl %%.CHECKBUF31_END - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE + cmp put_bits, 32 ; if (put_bits > 31) { + jl %%.CHECKBUF31_END + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE %%.CHECKBUF31_END: %endmacro %macro CHECKBUF47 0 - cmp put_bits, 48 ; if (put_bits > 47) { - jl %%.CHECKBUF47_END - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE - EMIT_BYTE + cmp put_bits, 48 ; if (put_bits > 47) { + jl %%.CHECKBUF47_END + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE + EMIT_BYTE %%.CHECKBUF47_END: %endmacro %macro EMIT_BITS 2 - CHECKBUF47 - mov ecx, %2 - PUT_BITS %1 + CHECKBUF47 + mov ecx, %2 + PUT_BITS %1 %endmacro -%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) - pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128(); - pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128(); - pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128(); - pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128(); - pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; - pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; - pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; - pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; - pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; - pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; - pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; - pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] 
= block[jno25]; - pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; - pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; - pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; - pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; - pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; - pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; - pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; - pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; - pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; - pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; - pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; - pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; - pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; - pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; - pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; - pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; - pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; - pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; - pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; - pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; - pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; - pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; - pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; +%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) + pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128(); + pxor 
xmm9, xmm9 ; __m128i neg = _mm_setzero_si128(); + pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128(); + pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128(); + pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; + pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; + pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; + pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; + pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; + pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; + pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; + pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; + pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; + pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; + pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; + pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; + pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; + pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; + pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; + pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; + pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; + pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; + pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; + pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; + pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; + pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; + pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = 
block[jno21]; + pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; + pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; + pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; + pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; + pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; + pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; + pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; + pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; %if %1 != 32 - pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; + pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; %else - pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31]; + pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31]; %endif - pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1); - paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg); - paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg); - paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg); - paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg); - pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg); - pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg); - pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg); - pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg); - pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1); - pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1); - pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1); - pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1); - movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); - movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); - movdqa XMMWORD [t1 + (%1 + 16) * 
SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); - movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); - movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); - movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); - movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); - movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); + pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1); + paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg); + paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg); + paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg); + paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg); + pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg); + pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg); + pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg); + pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg); + pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1); + pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1); + pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1); + pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1); + movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); + movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); + movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); + movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); + movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); + movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); + 
movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); + movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); %endmacro ; @@ -176,185 +176,185 @@ EXTN(jconst_huff_encode_one_block): ; r14 = c_derived_tbl *dctbl ; r15 = c_derived_tbl *actbl -%define t1 rbp-(DCTSIZE2*SIZEOF_WORD) -%define t2 t1-(DCTSIZE2*SIZEOF_WORD) -%define put_buffer r8 -%define put_bits r9d -%define buffer rax +%define t1 rbp-(DCTSIZE2*SIZEOF_WORD) +%define t2 t1-(DCTSIZE2*SIZEOF_WORD) +%define put_buffer r8 +%define put_bits r9d +%define buffer rax - align 16 - global EXTN(jsimd_huff_encode_one_block_sse2) + align 16 + global EXTN(jsimd_huff_encode_one_block_sse2) EXTN(jsimd_huff_encode_one_block_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [t2] - collect_args + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [t2] + collect_args %ifdef WIN64 - movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8 - movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9 - movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10 - movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11 - sub rsp, 4*SIZEOF_XMMWORD + movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8 + movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9 + movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10 + movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11 + sub rsp, 4*SIZEOF_XMMWORD %endif - push rbx - - mov buffer, r11 ; r11 is now sratch - - mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer; - mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits; - push r10 ; r10 is now scratch - - ; Encode the DC coefficient difference per section F.1.2.1 - movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val; - sub edi, r13d ; 
r13 is not used anymore - mov ebx, edi - - ; This is a well-known technique for obtaining the absolute value - ; without a branch. It is derived from an assembly language technique - ; presented in "How to Optimize for the Pentium Processors", - ; Copyright (c) 1996, 1997 by Agner Fog. - mov esi, edi - sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); - xor edi, esi ; temp ^= temp3; - sub edi, esi ; temp -= temp3; - - ; For a negative input, want temp2 = bitwise complement of abs(input) - ; This code assumes we are on a two's complement machine - add ebx, esi ; temp2 += temp3; - - ; Find the number of bits needed for the magnitude of the coefficient - lea r11, [rel jpeg_nbits_table] - movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp); - ; Emit the Huffman-coded symbol for the number of bits - mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits]; - movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits]; - EMIT_BITS r11, esi ; EMIT_BITS(code, size) - - ; Mask off any extra bits in code - mov esi, 1 - mov ecx, edi - shl esi, cl - dec esi - and ebx, esi ; temp2 &= (((JLONG) 1)<ehufco[0xf0]; - movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; - lea rsi, [t1] + push rbx + + mov buffer, r11 ; r11 is now sratch + + mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer; + mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits; + push r10 ; r10 is now scratch + + ; Encode the DC coefficient difference per section F.1.2.1 + movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val; + sub edi, r13d ; r13 is not used anymore + mov ebx, edi + + ; This is a well-known technique for obtaining the absolute value + ; without a branch. It is derived from an assembly language technique + ; presented in "How to Optimize for the Pentium Processors", + ; Copyright (c) 1996, 1997 by Agner Fog. 
+ mov esi, edi + sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); + xor edi, esi ; temp ^= temp3; + sub edi, esi ; temp -= temp3; + + ; For a negative input, want temp2 = bitwise complement of abs(input) + ; This code assumes we are on a two's complement machine + add ebx, esi ; temp2 += temp3; + + ; Find the number of bits needed for the magnitude of the coefficient + lea r11, [rel jpeg_nbits_table] + movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp); + ; Emit the Huffman-coded symbol for the number of bits + mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits]; + movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits]; + EMIT_BITS r11, esi ; EMIT_BITS(code, size) + + ; Mask off any extra bits in code + mov esi, 1 + mov ecx, edi + shl esi, cl + dec esi + and ebx, esi ; temp2 &= (((JLONG) 1)<ehufco[0xf0]; + movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + lea rsi, [t1] .BLOOP: - bsf r12, r11 ; r = __builtin_ctzl(index); - jz .ELOOP - mov rcx, r12 - lea rsi, [rsi+r12*2] ; k += r; - shr r11, cl ; index >>= r; - movzx rdi, word [rsi] ; temp = t1[k]; - lea rbx, [rel jpeg_nbits_table] - movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp); + bsf r12, r11 ; r = __builtin_ctzl(index); + jz .ELOOP + mov rcx, r12 + lea rsi, [rsi+r12*2] ; k += r; + shr r11, cl ; index >>= r; + movzx rdi, word [rsi] ; temp = t1[k]; + lea rbx, [rel jpeg_nbits_table] + movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp); .BRLOOP: - cmp r12, 16 ; while (r > 15) { - jl .ERLOOP - EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0) - sub r12, 16 ; r -= 16; - jmp .BRLOOP + cmp r12, 16 ; while (r > 15) { + jl .ERLOOP + EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0) + sub r12, 16 ; r -= 16; + jmp .BRLOOP .ERLOOP: - ; Emit Huffman symbol for run length / number of bits - CHECKBUF31 ; uses rcx, rdx - - shl r12, 4 ; temp3 = (r << 4) + nbits; - add r12, rdi - mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3]; - movzx ecx, byte 
[r15 + r12 + 1024] ; size = actbl->ehufsi[temp3]; - PUT_BITS rbx - - ;EMIT_CODE(code, size) - - movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k]; - ; Mask off any extra bits in code - mov rcx, rdi - mov rdx, 1 - shl rdx, cl - dec rdx - and rbx, rdx ; temp2 &= (((JLONG) 1)<>= 1; - add rsi, 2 ; ++k; - jmp .BLOOP + ; Emit Huffman symbol for run length / number of bits + CHECKBUF31 ; uses rcx, rdx + + shl r12, 4 ; temp3 = (r << 4) + nbits; + add r12, rdi + mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3]; + PUT_BITS rbx + + ;EMIT_CODE(code, size) + + movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov rcx, rdi + mov rdx, 1 + shl rdx, cl + dec rdx + and rbx, rdx ; temp2 &= (((JLONG) 1)<>= 1; + add rsi, 2 ; ++k; + jmp .BLOOP .ELOOP: - ; If the last coef(s) were zero, emit an end-of-block code - lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; - cmp rdi, rsi ; if (r > 0) { - je .EFN - mov ebx, INT [r15] ; code = actbl->ehufco[0]; - movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0]; - EMIT_BITS rbx, r12d + ; If the last coef(s) were zero, emit an end-of-block code + lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; + cmp rdi, rsi ; if (r > 0) { + je .EFN + mov ebx, INT [r15] ; code = actbl->ehufco[0]; + movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0]; + EMIT_BITS rbx, r12d .EFN: - pop r10 - ; Save put_buffer & put_bits - mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer; - mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits; + pop r10 + ; Save put_buffer & put_bits + mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer; + mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits; - pop rbx + pop rbx %ifdef WIN64 - movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD] - movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD] - movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD] - movaps xmm8, XMMWORD 
[rsp+3*SIZEOF_XMMWORD] - add rsp, 4*SIZEOF_XMMWORD + movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD] + movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD] + movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD] + movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD] + add rsp, 4*SIZEOF_XMMWORD %endif - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jchuff-sse2.asm b/simd/jchuff-sse2.asm index 36d1f2d..cfae68c 100644 --- a/simd/jchuff-sse2.asm +++ b/simd/jchuff-sse2.asm @@ -23,20 +23,20 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_huff_encode_one_block) + alignz 16 + global EXTN(jconst_huff_encode_one_block) EXTN(jconst_huff_encode_one_block): %include "jpeg_nbits_table.inc" - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; These macros perform the same task as the emit_bits() function in the ; original libjpeg code. In addition to reducing overhead by explicitly @@ -46,105 +46,105 @@ EXTN(jconst_huff_encode_one_block): ; bytes can be stored in a 64-bit bit buffer before it has to be emptied. %macro EMIT_BYTE 0 - sub put_bits, 8 ; put_bits -= 8; - mov edx, put_buffer - mov ecx, put_bits - shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); - mov byte [eax], dl ; *buffer++ = c; - add eax, 1 - cmp dl, 0xFF ; need to stuff a zero byte? 
- jne %%.EMIT_BYTE_END - mov byte [eax], 0 ; *buffer++ = 0; - add eax, 1 + sub put_bits, 8 ; put_bits -= 8; + mov edx, put_buffer + mov ecx, put_bits + shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits); + mov byte [eax], dl ; *buffer++ = c; + add eax, 1 + cmp dl, 0xFF ; need to stuff a zero byte? + jne %%.EMIT_BYTE_END + mov byte [eax], 0 ; *buffer++ = 0; + add eax, 1 %%.EMIT_BYTE_END: %endmacro %macro PUT_BITS 1 - add put_bits, ecx ; put_bits += size; - shl put_buffer, cl ; put_buffer = (put_buffer << size); - or put_buffer, %1 + add put_bits, ecx ; put_bits += size; + shl put_buffer, cl ; put_buffer = (put_buffer << size); + or put_buffer, %1 %endmacro %macro CHECKBUF15 0 - cmp put_bits, 16 ; if (put_bits > 31) { - jl %%.CHECKBUF15_END - mov eax, POINTER [esp+buffer] - EMIT_BYTE - EMIT_BYTE - mov POINTER [esp+buffer], eax + cmp put_bits, 16 ; if (put_bits > 31) { + jl %%.CHECKBUF15_END + mov eax, POINTER [esp+buffer] + EMIT_BYTE + EMIT_BYTE + mov POINTER [esp+buffer], eax %%.CHECKBUF15_END: %endmacro %macro EMIT_BITS 1 - PUT_BITS %1 - CHECKBUF15 + PUT_BITS %1 + CHECKBUF15 %endmacro -%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) - pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128(); - pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128(); - pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128(); - pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128(); - pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; - pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; - pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; - pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; - pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; - pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; - pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; - pinsrw %37, word [esi + %27 * 
SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; - pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; - pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; - pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; - pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; - pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; - pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; - pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; - pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; - pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; - pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; - pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; - pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; - pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; - pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; - pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21]; - pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; - pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; - pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; - pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; - pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; - pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; - pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; - pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; +%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) + pxor xmm4, xmm4 ; __m128i neg 
= _mm_setzero_si128(); + pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128(); + pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128(); + pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128(); + pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0]; + pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8]; + pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16]; + pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24]; + pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1]; + pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9]; + pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17]; + pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25]; + pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2]; + pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10]; + pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18]; + pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26]; + pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3]; + pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11]; + pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19]; + pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27]; + pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4]; + pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12]; + pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20]; + pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28]; + pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5]; + pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13]; + pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; 
xmm_shadow[21] = block[jno21]; + pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29]; + pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6]; + pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14]; + pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22]; + pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30]; + pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7]; + pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15]; + pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23]; %if %1 != 32 - pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; + pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31]; %else - pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31]; + pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31]; %endif - pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1); - pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1); - paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg); - paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg); - paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg); - paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg); - pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg); - pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg); - pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg); - pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg); - pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1); - pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1); - pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1); - pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1); - movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); - movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); - movdqa XMMWORD [esp + 
t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); - movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); - movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); - movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); - movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); - movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); + pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1); + pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1); + paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg); + paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg); + paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg); + paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg); + pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg); + pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg); + pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg); + pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg); + pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1); + pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1); + pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1); + pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1); + movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1); + movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1); + movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1); + movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1); + movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg); + movdqa XMMWORD [esp + t2 + (%1 + 8) * 
SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg); + movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg); + movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg); %endmacro ; @@ -163,264 +163,264 @@ EXTN(jconst_huff_encode_one_block): ; eax + 24 = c_derived_tbl *dctbl ; eax + 28 = c_derived_tbl *actbl -%define pad 6*SIZEOF_DWORD ; Align to 16 bytes -%define t1 pad -%define t2 t1+(DCTSIZE2*SIZEOF_WORD) -%define block t2+(DCTSIZE2*SIZEOF_WORD) -%define actbl block+SIZEOF_DWORD -%define buffer actbl+SIZEOF_DWORD -%define temp buffer+SIZEOF_DWORD -%define temp2 temp+SIZEOF_DWORD -%define temp3 temp2+SIZEOF_DWORD -%define temp4 temp3+SIZEOF_DWORD -%define temp5 temp4+SIZEOF_DWORD -%define gotptr temp5+SIZEOF_DWORD ; void *gotptr -%define put_buffer ebx -%define put_bits edi - - align 16 - global EXTN(jsimd_huff_encode_one_block_sse2) +%define pad 6*SIZEOF_DWORD ; Align to 16 bytes +%define t1 pad +%define t2 t1+(DCTSIZE2*SIZEOF_WORD) +%define block t2+(DCTSIZE2*SIZEOF_WORD) +%define actbl block+SIZEOF_DWORD +%define buffer actbl+SIZEOF_DWORD +%define temp buffer+SIZEOF_DWORD +%define temp2 temp+SIZEOF_DWORD +%define temp3 temp2+SIZEOF_DWORD +%define temp4 temp3+SIZEOF_DWORD +%define temp5 temp4+SIZEOF_DWORD +%define gotptr temp5+SIZEOF_DWORD ; void *gotptr +%define put_buffer ebx +%define put_bits edi + + align 16 + global EXTN(jsimd_huff_encode_one_block_sse2) EXTN(jsimd_huff_encode_one_block_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - sub esp, temp5+9*SIZEOF_DWORD-pad - push ebx - push ecx -; push edx ; need not be preserved - push esi - push edi - push ebp - - mov esi, POINTER [eax+8] ; (working_state *state) - mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer; - mov put_bits, DWORD [esi+12] 
; put_bits = state->cur.put_bits; - push esi ; esi is now scratch - - get_GOT edx ; get GOT address - movpic POINTER [esp+gotptr], edx ; save GOT address - - mov ecx, POINTER [eax+28] - mov edx, POINTER [eax+16] - mov esi, POINTER [eax+12] - mov POINTER [esp+actbl], ecx - mov POINTER [esp+block], edx - mov POINTER [esp+buffer], esi - - ; Encode the DC coefficient difference per section F.1.2.1 - mov esi, POINTER [esp+block] ; block - movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val; - sub ecx, DWORD [eax+20] - mov esi, ecx - - ; This is a well-known technique for obtaining the absolute value - ; without a branch. It is derived from an assembly language technique - ; presented in "How to Optimize for the Pentium Processors", - ; Copyright (c) 1996, 1997 by Agner Fog. - mov edx, ecx - sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); - xor ecx, edx ; temp ^= temp3; - sub ecx, edx ; temp -= temp3; - - ; For a negative input, want temp2 = bitwise complement of abs(input) - ; This code assumes we are on a two's complement machine - add esi, edx ; temp2 += temp3; - mov DWORD [esp+temp], esi ; backup temp2 in temp - - ; Find the number of bits needed for the magnitude of the coefficient - movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp) - movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp); - mov DWORD [esp+temp2], edx ; backup nbits in temp2 - - ; Emit the Huffman-coded symbol for the number of bits - mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore - mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits]; - movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits]; - EMIT_BITS eax ; EMIT_BITS(code, size) - - mov ecx, DWORD [esp+temp2] ; restore nbits - - ; Mask off any extra bits in code - mov eax, 1 - shl eax, cl - dec eax - and eax, DWORD [esp+temp] ; temp2 &= (((JLONG) 1)<cur.put_buffer; + mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits; + push esi 
; esi is now scratch + + get_GOT edx ; get GOT address + movpic POINTER [esp+gotptr], edx ; save GOT address + + mov ecx, POINTER [eax+28] + mov edx, POINTER [eax+16] + mov esi, POINTER [eax+12] + mov POINTER [esp+actbl], ecx + mov POINTER [esp+block], edx + mov POINTER [esp+buffer], esi + + ; Encode the DC coefficient difference per section F.1.2.1 + mov esi, POINTER [esp+block] ; block + movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val; + sub ecx, DWORD [eax+20] + mov esi, ecx + + ; This is a well-known technique for obtaining the absolute value + ; without a branch. It is derived from an assembly language technique + ; presented in "How to Optimize for the Pentium Processors", + ; Copyright (c) 1996, 1997 by Agner Fog. + mov edx, ecx + sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); + xor ecx, edx ; temp ^= temp3; + sub ecx, edx ; temp -= temp3; + + ; For a negative input, want temp2 = bitwise complement of abs(input) + ; This code assumes we are on a two's complement machine + add esi, edx ; temp2 += temp3; + mov DWORD [esp+temp], esi ; backup temp2 in temp + + ; Find the number of bits needed for the magnitude of the coefficient + movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp) + movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp); + mov DWORD [esp+temp2], edx ; backup nbits in temp2 + + ; Emit the Huffman-coded symbol for the number of bits + mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore + mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits]; + movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits]; + EMIT_BITS eax ; EMIT_BITS(code, size) + + mov ecx, DWORD [esp+temp2] ; restore nbits + + ; Mask off any extra bits in code + mov eax, 1 + shl eax, cl + dec eax + and eax, DWORD [esp+temp] ; temp2 &= (((JLONG) 1)<>= r; - mov DWORD [esp+temp3], edx + bsf ecx, edx ; r = __builtin_ctzl(index); + jz .ELOOP + lea esi, [esi+ecx*2] ; k += r; + shr edx,
cl ; index >>= r; + mov DWORD [esp+temp3], edx .BRLOOP: - cmp ecx, 16 ; while (r > 15) { - jl .ERLOOP - sub ecx, 16 ; r -= 16; - mov DWORD [esp+temp], ecx - mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; - movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; - EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) - mov ecx, DWORD [esp+temp] - jmp .BRLOOP + cmp ecx, 16 ; while (r > 15) { + jl .ERLOOP + sub ecx, 16 ; r -= 16; + mov DWORD [esp+temp], ecx + mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; + movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) + mov ecx, DWORD [esp+temp] + jmp .BRLOOP .ERLOOP: - movsx eax, word [esi] ; temp = t1[k]; - movpic edx, POINTER [esp+gotptr] ; load GOT address (edx) - movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp); - mov DWORD [esp+temp2], eax - ; Emit Huffman symbol for run length / number of bits - shl ecx, 4 ; temp3 = (r << 4) + nbits; - add ecx, eax - mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; - movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; - EMIT_BITS eax - - movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; - ; Mask off any extra bits in code - mov ecx, DWORD [esp+temp2] - mov eax, 1 - shl eax, cl - dec eax - and eax, edx ; temp2 &= (((JLONG) 1)<>= 1; - - jmp .BLOOP + movsx eax, word [esi] ; temp = t1[k]; + movpic edx, POINTER [esp+gotptr] ; load GOT address (edx) + movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp); + mov DWORD [esp+temp2], eax + ; Emit Huffman symbol for run length / number of bits + shl ecx, 4 ; temp3 = (r << 4) + nbits; + add ecx, eax + mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; + EMIT_BITS eax + + movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov ecx, DWORD [esp+temp2] 
+ mov eax, 1 + shl eax, cl + dec eax + and eax, edx ; temp2 &= (((JLONG) 1)<>= 1; + + jmp .BLOOP .ELOOP: - movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0)); - movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8)); - movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16)); - movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24)); - pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); - pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); - pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); - pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero); - packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); - packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); - pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; - pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; - shl ecx, 16 - or edx, ecx - not edx ; index = ~index; - - lea eax, [esp + t1 + (DCTSIZE2/2) * 2] - sub eax, esi - shr eax, 1 - bsf ecx, edx ; r = __builtin_ctzl(index); - jz .ELOOP2 - shr edx, cl ; index >>= r; - add ecx, eax - lea esi, [esi+ecx*2] ; k += r; - mov DWORD [esp+temp3], edx - jmp .BRLOOP2 + movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0)); + movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8)); + movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16)); + movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24)); + pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero); + pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero); + pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero); + pcmpeqw xmm3, xmm7 ; tmp3 = 
_mm_cmpeq_epi16(tmp3, zero); + packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1); + packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3); + pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0; + pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16; + shl ecx, 16 + or edx, ecx + not edx ; index = ~index; + + lea eax, [esp + t1 + (DCTSIZE2/2) * 2] + sub eax, esi + shr eax, 1 + bsf ecx, edx ; r = __builtin_ctzl(index); + jz .ELOOP2 + shr edx, cl ; index >>= r; + add ecx, eax + lea esi, [esi+ecx*2] ; k += r; + mov DWORD [esp+temp3], edx + jmp .BRLOOP2 .BLOOP2: - bsf ecx, edx ; r = __builtin_ctzl(index); - jz .ELOOP2 - lea esi, [esi+ecx*2] ; k += r; - shr edx, cl ; index >>= r; - mov DWORD [esp+temp3], edx + bsf ecx, edx ; r = __builtin_ctzl(index); + jz .ELOOP2 + lea esi, [esi+ecx*2] ; k += r; + shr edx, cl ; index >>= r; + mov DWORD [esp+temp3], edx .BRLOOP2: - cmp ecx, 16 ; while (r > 15) { - jl .ERLOOP2 - sub ecx, 16 ; r -= 16; - mov DWORD [esp+temp], ecx - mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; - movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; - EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) - mov ecx, DWORD [esp+temp] - jmp .BRLOOP2 + cmp ecx, 16 ; while (r > 15) { + jl .ERLOOP2 + sub ecx, 16 ; r -= 16; + mov DWORD [esp+temp], ecx + mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; + movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; + EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) + mov ecx, DWORD [esp+temp] + jmp .BRLOOP2 .ERLOOP2: - movsx eax, word [esi] ; temp = t1[k]; - bsr eax, eax ; nbits = 32 - __builtin_clz(temp); - inc eax - mov DWORD [esp+temp2], eax - ; Emit Huffman symbol for run length / number of bits - shl ecx, 4 ; temp3 = (r << 4) + nbits; - add ecx, eax - mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; - movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; - EMIT_BITS eax - - movsx edx, word 
[esi+DCTSIZE2*2] ; temp2 = t2[k]; - ; Mask off any extra bits in code - mov ecx, DWORD [esp+temp2] - mov eax, 1 - shl eax, cl - dec eax - and eax, edx ; temp2 &= (((JLONG) 1)<>= 1; - - jmp .BLOOP2 + movsx eax, word [esi] ; temp = t1[k]; + bsr eax, eax ; nbits = 32 - __builtin_clz(temp); + inc eax + mov DWORD [esp+temp2], eax + ; Emit Huffman symbol for run length / number of bits + shl ecx, 4 ; temp3 = (r << 4) + nbits; + add ecx, eax + mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3]; + movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3]; + EMIT_BITS eax + + movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; + ; Mask off any extra bits in code + mov ecx, DWORD [esp+temp2] + mov eax, 1 + shl eax, cl + dec eax + and eax, edx ; temp2 &= (((JLONG) 1)<>= 1; + + jmp .BLOOP2 .ELOOP2: - ; If the last coef(s) were zero, emit an end-of-block code - lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; - cmp edx, esi ; if (r > 0) { - je .EFN - mov eax, INT [ebp] ; code = actbl->ehufco[0]; - movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0]; - EMIT_BITS eax + ; If the last coef(s) were zero, emit an end-of-block code + lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k; + cmp edx, esi ; if (r > 0) { + je .EFN + mov eax, INT [ebp] ; code = actbl->ehufco[0]; + movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0]; + EMIT_BITS eax .EFN: - mov eax, [esp+buffer] - pop esi - ; Save put_buffer & put_bits - mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer; - mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits; - - pop ebp - pop edi - pop esi -; pop edx ; need not be preserved - pop ecx - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + mov eax, [esp+buffer] + pop esi + ; Save put_buffer & put_bits + mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer; + mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits; + + pop ebp + pop edi + pop esi +; pop edx ; 
need not be preserved + pop ecx + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jcsample-sse2-64.asm b/simd/jcsample-sse2-64.asm index 40ee15f..f6b2aa7 100644 --- a/simd/jcsample-sse2-64.asm +++ b/simd/jcsample-sse2-64.asm @@ -19,8 +19,8 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Downsample pixel values of a single component. ; This version handles the common case of 2:1 horizontal and 1:1 vertical, @@ -39,130 +39,130 @@ ; r14 = JSAMPARRAY input_data ; r15 = JSAMPARRAY output_data - align 16 - global EXTN(jsimd_h2v1_downsample_sse2) + align 16 + global EXTN(jsimd_h2v1_downsample_sse2) EXTN(jsimd_h2v1_downsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args + push rbp + mov rax, rsp + mov rbp, rsp + collect_args - mov ecx, r13d - shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) - jz near .return + mov ecx, r13d + shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return - mov edx, r10d + mov edx, r10d - ; -- expand_right_edge + ; -- expand_right_edge - push rcx - shl rcx,1 ; output_cols * 2 - sub rcx,rdx - jle short .expand_end + push rcx + shl rcx, 1 ; output_cols * 2 + sub rcx, rdx + jle short .expand_end - mov rax, r11 - test rax,rax - jle short .expand_end + mov rax, r11 + test rax, rax + jle short .expand_end - cld - mov rsi, r14 ; input_data + cld + mov rsi, r14 ; input_data .expandloop: - push rax - push rcx + push rax + push rcx - mov rdi, JSAMPROW [rsi] - add rdi,rdx - mov al, JSAMPLE [rdi-1] + mov rdi, JSAMPROW [rsi] + add rdi, rdx + mov al, JSAMPLE [rdi-1] - rep stosb + rep stosb - pop rcx - pop rax + pop rcx + pop rax - add rsi, byte SIZEOF_JSAMPROW - dec rax - jg short .expandloop + add rsi, byte SIZEOF_JSAMPROW + dec rax 
+ jg short .expandloop .expand_end: - pop rcx ; output_cols + pop rcx ; output_cols - ; -- h2v1_downsample + ; -- h2v1_downsample - mov eax, r12d ; rowctr - test eax,eax - jle near .return + mov eax, r12d ; rowctr + test eax, eax + jle near .return - mov rdx, 0x00010000 ; bias pattern - movd xmm7,edx - pcmpeqw xmm6,xmm6 - pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + mov rdx, 0x00010000 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - mov rsi, r14 ; input_data - mov rdi, r15 ; output_data + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data .rowloop: - push rcx - push rdi - push rsi + push rcx + push rdi + push rsi - mov rsi, JSAMPROW [rsi] ; inptr - mov rdi, JSAMPROW [rdi] ; outptr + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr - cmp rcx, byte SIZEOF_XMMWORD - jae short .columnloop + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop .columnloop_r8: - movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] - pxor xmm1,xmm1 - mov rcx, SIZEOF_XMMWORD - jmp short .downsample + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm1, xmm1 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample .columnloop: - movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD] .downsample: - movdqa xmm2,xmm0 - movdqa xmm3,xmm1 - - pand xmm0,xmm6 - psrlw xmm2,BYTE_BIT - pand xmm1,xmm6 - psrlw xmm3,BYTE_BIT - - paddw xmm0,xmm2 - paddw xmm1,xmm3 - paddw xmm0,xmm7 - paddw xmm1,xmm7 - psrlw xmm0,1 - psrlw xmm1,1 - - packuswb xmm0,xmm1 - - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 - - sub rcx, byte SIZEOF_XMMWORD ; outcol - add rsi, byte 2*SIZEOF_XMMWORD ; inptr - add rdi, byte 1*SIZEOF_XMMWORD ; outptr - cmp rcx, byte SIZEOF_XMMWORD - jae short .columnloop - test rcx,rcx 
- jnz short .columnloop_r8 - - pop rsi - pop rdi - pop rcx - - add rsi, byte SIZEOF_JSAMPROW ; input_data - add rdi, byte SIZEOF_JSAMPROW ; output_data - dec rax ; rowctr - jg near .rowloop + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + pand xmm0, xmm6 + psrlw xmm2, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm3, BYTE_BIT + + paddw xmm0, xmm2 + paddw xmm1, xmm3 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + psrlw xmm0, 1 + psrlw xmm1, 1 + + packuswb xmm0, xmm1 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop + test rcx, rcx + jnz short .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop .return: - uncollect_args - pop rbp - ret + uncollect_args + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -183,147 +183,147 @@ EXTN(jsimd_h2v1_downsample_sse2): ; r14 = JSAMPARRAY input_data ; r15 = JSAMPARRAY output_data - align 16 - global EXTN(jsimd_h2v2_downsample_sse2) + align 16 + global EXTN(jsimd_h2v2_downsample_sse2) EXTN(jsimd_h2v2_downsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args + push rbp + mov rax, rsp + mov rbp, rsp + collect_args - mov ecx, r13d - shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) - jz near .return + mov ecx, r13d + shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return - mov edx, r10d + mov edx, r10d - ; -- expand_right_edge + ; -- expand_right_edge - push rcx - shl rcx,1 ; output_cols * 2 - sub rcx,rdx - jle short .expand_end + push rcx + shl rcx, 1 ; output_cols * 2 + sub rcx, rdx + jle short .expand_end - mov rax, r11 - test rax,rax - jle short .expand_end + mov rax, r11 + test rax, rax + jle short .expand_end - cld - mov rsi, r14 ; input_data + cld + mov rsi, r14 ; input_data 
.expandloop: - push rax - push rcx + push rax + push rcx - mov rdi, JSAMPROW [rsi] - add rdi,rdx - mov al, JSAMPLE [rdi-1] + mov rdi, JSAMPROW [rsi] + add rdi, rdx + mov al, JSAMPLE [rdi-1] - rep stosb + rep stosb - pop rcx - pop rax + pop rcx + pop rax - add rsi, byte SIZEOF_JSAMPROW - dec rax - jg short .expandloop + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop .expand_end: - pop rcx ; output_cols + pop rcx ; output_cols - ; -- h2v2_downsample + ; -- h2v2_downsample - mov eax, r12d ; rowctr - test rax,rax - jle near .return + mov eax, r12d ; rowctr + test rax, rax + jle near .return - mov rdx, 0x00020001 ; bias pattern - movd xmm7,edx - pcmpeqw xmm6,xmm6 - pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + mov rdx, 0x00020001 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - mov rsi, r14 ; input_data - mov rdi, r15 ; output_data + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data .rowloop: - push rcx - push rdi - push rsi + push rcx + push rdi + push rsi - mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 - mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 - mov rdi, JSAMPROW [rdi] ; outptr + mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 + mov rdi, JSAMPROW [rdi] ; outptr - cmp rcx, byte SIZEOF_XMMWORD - jae short .columnloop + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop .columnloop_r8: - movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] - pxor xmm2,xmm2 - pxor xmm3,xmm3 - mov rcx, SIZEOF_XMMWORD - jmp short .downsample + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm2, xmm2 + pxor xmm3, xmm3 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample .columnloop: - movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] - 
movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD] - movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD] .downsample: - movdqa xmm4,xmm0 - movdqa xmm5,xmm1 - pand xmm0,xmm6 - psrlw xmm4,BYTE_BIT - pand xmm1,xmm6 - psrlw xmm5,BYTE_BIT - paddw xmm0,xmm4 - paddw xmm1,xmm5 - - movdqa xmm4,xmm2 - movdqa xmm5,xmm3 - pand xmm2,xmm6 - psrlw xmm4,BYTE_BIT - pand xmm3,xmm6 - psrlw xmm5,BYTE_BIT - paddw xmm2,xmm4 - paddw xmm3,xmm5 - - paddw xmm0,xmm1 - paddw xmm2,xmm3 - paddw xmm0,xmm7 - paddw xmm2,xmm7 - psrlw xmm0,2 - psrlw xmm2,2 - - packuswb xmm0,xmm2 - - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 - - sub rcx, byte SIZEOF_XMMWORD ; outcol - add rdx, byte 2*SIZEOF_XMMWORD ; inptr0 - add rsi, byte 2*SIZEOF_XMMWORD ; inptr1 - add rdi, byte 1*SIZEOF_XMMWORD ; outptr - cmp rcx, byte SIZEOF_XMMWORD - jae near .columnloop - test rcx,rcx - jnz near .columnloop_r8 - - pop rsi - pop rdi - pop rcx - - add rsi, byte 2*SIZEOF_JSAMPROW ; input_data - add rdi, byte 1*SIZEOF_JSAMPROW ; output_data - dec rax ; rowctr - jg near .rowloop + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + pand xmm0, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + pand xmm2, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm3, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + paddw xmm0, xmm1 + paddw xmm2, xmm3 + paddw xmm0, xmm7 + paddw xmm2, xmm7 + psrlw xmm0, 2 + psrlw xmm2, 2 + + packuswb xmm0, xmm2 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rdx, byte 2*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 2*SIZEOF_XMMWORD ; inptr1 + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx, rcx + jnz near 
.columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte 2*SIZEOF_JSAMPROW ; input_data + add rdi, byte 1*SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop .return: - uncollect_args - pop rbp - ret + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jcsample-sse2.asm b/simd/jcsample-sse2.asm index 83c9d15..5fc2637 100644 --- a/simd/jcsample-sse2.asm +++ b/simd/jcsample-sse2.asm @@ -18,8 +18,8 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Downsample pixel values of a single component. ; This version handles the common case of 2:1 horizontal and 1:1 vertical, @@ -38,141 +38,141 @@ %define input_data(b) (b)+24 ; JSAMPARRAY input_data %define output_data(b) (b)+28 ; JSAMPARRAY output_data - align 16 - global EXTN(jsimd_h2v1_downsample_sse2) + align 16 + global EXTN(jsimd_h2v1_downsample_sse2) EXTN(jsimd_h2v1_downsample_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov ecx, JDIMENSION [width_blks(ebp)] - shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) - jz near .return - - mov edx, JDIMENSION [img_width(ebp)] - - ; -- expand_right_edge - - push ecx - shl ecx,1 ; output_cols * 2 - sub ecx,edx - jle short .expand_end - - mov eax, INT [max_v_samp(ebp)] - test eax,eax - jle short .expand_end - - cld - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16,7 + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 
1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 .expandloop: - push eax - push ecx + push eax + push ecx - mov edi, JSAMPROW [esi] - add edi,edx - mov al, JSAMPLE [edi-1] + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] - rep stosb + rep stosb - pop ecx - pop eax + pop ecx + pop eax - add esi, byte SIZEOF_JSAMPROW - dec eax - jg short .expandloop + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop .expand_end: - pop ecx ; output_cols + pop ecx ; output_cols - ; -- h2v1_downsample + ; -- h2v1_downsample - mov eax, JDIMENSION [v_samp(ebp)] ; rowctr - test eax,eax - jle near .return + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return - mov edx, 0x00010000 ; bias pattern - movd xmm7,edx - pcmpeqw xmm6,xmm6 - pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + mov edx, 0x00010000 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16,7 + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 .rowloop: - push ecx - push edi - push esi + push ecx + push edi + push esi - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr - cmp ecx, byte SIZEOF_XMMWORD - jae short .columnloop - alignx 16,7 + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16, 7 .columnloop_r8: - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - pxor xmm1,xmm1 - mov ecx, SIZEOF_XMMWORD - jmp short .downsample - alignx 16,7 + movdqa xmm0, XMMWORD 
[esi+0*SIZEOF_XMMWORD] + pxor xmm1, xmm1 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16, 7 .columnloop: - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] .downsample: - movdqa xmm2,xmm0 - movdqa xmm3,xmm1 - - pand xmm0,xmm6 - psrlw xmm2,BYTE_BIT - pand xmm1,xmm6 - psrlw xmm3,BYTE_BIT - - paddw xmm0,xmm2 - paddw xmm1,xmm3 - paddw xmm0,xmm7 - paddw xmm1,xmm7 - psrlw xmm0,1 - psrlw xmm1,1 - - packuswb xmm0,xmm1 - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - - sub ecx, byte SIZEOF_XMMWORD ; outcol - add esi, byte 2*SIZEOF_XMMWORD ; inptr - add edi, byte 1*SIZEOF_XMMWORD ; outptr - cmp ecx, byte SIZEOF_XMMWORD - jae short .columnloop - test ecx,ecx - jnz short .columnloop_r8 - - pop esi - pop edi - pop ecx - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec eax ; rowctr - jg near .rowloop + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + pand xmm0, xmm6 + psrlw xmm2, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm3, BYTE_BIT + + paddw xmm0, xmm2 + paddw xmm1, xmm3 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + psrlw xmm0, 1 + psrlw xmm1, 1 + + packuswb xmm0, xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + test ecx, ecx + jnz short .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret ; 
-------------------------------------------------------------------------- ; @@ -193,158 +193,158 @@ EXTN(jsimd_h2v1_downsample_sse2): %define input_data(b) (b)+24 ; JSAMPARRAY input_data %define output_data(b) (b)+28 ; JSAMPARRAY output_data - align 16 - global EXTN(jsimd_h2v2_downsample_sse2) + align 16 + global EXTN(jsimd_h2v2_downsample_sse2) EXTN(jsimd_h2v2_downsample_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov ecx, JDIMENSION [width_blks(ebp)] - shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) - jz near .return - - mov edx, JDIMENSION [img_width(ebp)] - - ; -- expand_right_edge - - push ecx - shl ecx,1 ; output_cols * 2 - sub ecx,edx - jle short .expand_end - - mov eax, INT [max_v_samp(ebp)] - test eax,eax - jle short .expand_end - - cld - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16,7 + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx, 1 ; output_cols * 2 + sub ecx, edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax, eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16, 7 .expandloop: - push eax - push ecx + push eax + push ecx - mov edi, JSAMPROW [esi] - add edi,edx - mov al, JSAMPLE [edi-1] + mov edi, JSAMPROW [esi] + add edi, edx + mov al, JSAMPLE [edi-1] - rep stosb + rep stosb - pop ecx - pop eax + pop ecx + pop eax - add esi, byte SIZEOF_JSAMPROW - dec eax - jg short .expandloop + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop .expand_end: - pop ecx ; output_cols + pop ecx ; output_cols - ; -- h2v2_downsample + ; -- h2v2_downsample - mov eax, JDIMENSION 
[v_samp(ebp)] ; rowctr - test eax,eax - jle near .return + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax, eax + jle near .return - mov edx, 0x00020001 ; bias pattern - movd xmm7,edx - pcmpeqw xmm6,xmm6 - pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + mov edx, 0x00020001 ; bias pattern + movd xmm7, edx + pcmpeqw xmm6, xmm6 + pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16,7 + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16, 7 .rowloop: - push ecx - push edi - push esi + push ecx + push edi + push esi - mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 - mov edi, JSAMPROW [edi] ; outptr + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr - cmp ecx, byte SIZEOF_XMMWORD - jae short .columnloop - alignx 16,7 + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16, 7 .columnloop_r8: - movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] - pxor xmm2,xmm2 - pxor xmm3,xmm3 - mov ecx, SIZEOF_XMMWORD - jmp short .downsample - alignx 16,7 + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm2, xmm2 + pxor xmm3, xmm3 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16, 7 .columnloop: - movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] - movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] + movdqa xmm3, 
XMMWORD [esi+1*SIZEOF_XMMWORD] .downsample: - movdqa xmm4,xmm0 - movdqa xmm5,xmm1 - pand xmm0,xmm6 - psrlw xmm4,BYTE_BIT - pand xmm1,xmm6 - psrlw xmm5,BYTE_BIT - paddw xmm0,xmm4 - paddw xmm1,xmm5 - - movdqa xmm4,xmm2 - movdqa xmm5,xmm3 - pand xmm2,xmm6 - psrlw xmm4,BYTE_BIT - pand xmm3,xmm6 - psrlw xmm5,BYTE_BIT - paddw xmm2,xmm4 - paddw xmm3,xmm5 - - paddw xmm0,xmm1 - paddw xmm2,xmm3 - paddw xmm0,xmm7 - paddw xmm2,xmm7 - psrlw xmm0,2 - psrlw xmm2,2 - - packuswb xmm0,xmm2 - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - - sub ecx, byte SIZEOF_XMMWORD ; outcol - add edx, byte 2*SIZEOF_XMMWORD ; inptr0 - add esi, byte 2*SIZEOF_XMMWORD ; inptr1 - add edi, byte 1*SIZEOF_XMMWORD ; outptr - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - test ecx,ecx - jnz near .columnloop_r8 - - pop esi - pop edi - pop ecx - - add esi, byte 2*SIZEOF_JSAMPROW ; input_data - add edi, byte 1*SIZEOF_JSAMPROW ; output_data - dec eax ; rowctr - jg near .rowloop + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + pand xmm0, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm1, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + pand xmm2, xmm6 + psrlw xmm4, BYTE_BIT + pand xmm3, xmm6 + psrlw xmm5, BYTE_BIT + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + paddw xmm0, xmm1 + paddw xmm2, xmm3 + paddw xmm0, xmm7 + paddw xmm2, xmm7 + psrlw xmm0, 2 + psrlw xmm2, 2 + + packuswb xmm0, xmm2 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add edx, byte 2*SIZEOF_XMMWORD ; inptr0 + add esi, byte 2*SIZEOF_XMMWORD ; inptr1 + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx, ecx + jnz near .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx 
; unused - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jdcolext-sse2-64.asm b/simd/jdcolext-sse2-64.asm index 4634066..a42091e 100644 --- a/simd/jdcolext-sse2-64.asm +++ b/simd/jdcolext-sse2-64.asm @@ -34,407 +34,407 @@ ; r13 = JSAMPARRAY output_buf ; r14 = int num_rows -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_ycc_rgb_convert_sse2) + align 16 + global EXTN(jsimd_ycc_rgb_convert_sse2) EXTN(jsimd_ycc_rgb_convert_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - push rbx - - mov ecx, r10d ; num_cols - test rcx,rcx - jz near .return - - push rcx - - mov rdi, r11 - mov ecx, r12d - mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] - mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] - mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] - lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] - lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] - lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] - - pop rcx - - mov rdi, r13 - mov eax, r14d - test rax,rax - jle near .return + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d ; num_cols + test rcx, rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + lea rbx, 
[rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rdi, r13 + mov eax, r14d + test rax, rax + jle near .return .rowloop: - push rax - push rdi - push rdx - push rbx - push rsi - push rcx ; col - - mov rsi, JSAMPROW [rsi] ; inptr0 - mov rbx, JSAMPROW [rbx] ; inptr1 - mov rdx, JSAMPROW [rdx] ; inptr2 - mov rdi, JSAMPROW [rdi] ; outptr + push rax + push rdi + push rdx + push rbx + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr0 + mov rbx, JSAMPROW [rbx] ; inptr1 + mov rdx, JSAMPROW [rdx] ; inptr2 + mov rdi, JSAMPROW [rdi] ; outptr .columnloop: - movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF) - movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF) - - pcmpeqw xmm4,xmm4 - pcmpeqw xmm7,xmm7 - psrlw xmm4,BYTE_BIT - psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} - movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} - - pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE - psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO - pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE - psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO - - paddw xmm4,xmm7 - paddw xmm5,xmm7 - paddw xmm0,xmm7 - paddw xmm1,xmm7 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movdqa xmm2,xmm4 ; xmm2=CbE - movdqa xmm3,xmm5 ; xmm3=CbO - paddw xmm4,xmm4 ; xmm4=2*CbE - paddw xmm5,xmm5 ; xmm5=2*CbO - movdqa xmm6,xmm0 ; xmm6=CrE - movdqa xmm7,xmm1 ; xmm7=CrO - paddw xmm0,xmm0 ; xmm0=2*CrE - paddw xmm1,xmm1 ; xmm1=2*CrO - - pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800)) - pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800)) - pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200)) - pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200)) - - paddw xmm4,[rel PW_ONE] - paddw xmm5,[rel PW_ONE] - psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) - psraw xmm5,1 ; xmm5=(CbO * 
-FIX(0.22800)) - paddw xmm0,[rel PW_ONE] - paddw xmm1,[rel PW_ONE] - psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) - psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) - - paddw xmm4,xmm2 - paddw xmm5,xmm3 - paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E - paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O - paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E - paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O - - movdqa xmm4,xmm2 - movdqa xmm5,xmm3 - punpcklwd xmm2,xmm6 - punpckhwd xmm4,xmm6 - pmaddwd xmm2,[rel PW_MF0344_F0285] - pmaddwd xmm4,[rel PW_MF0344_F0285] - punpcklwd xmm3,xmm7 - punpckhwd xmm5,xmm7 - pmaddwd xmm3,[rel PW_MF0344_F0285] - pmaddwd xmm5,[rel PW_MF0344_F0285] - - paddd xmm2,[rel PD_ONEHALF] - paddd xmm4,[rel PD_ONEHALF] - psrad xmm2,SCALEBITS - psrad xmm4,SCALEBITS - paddd xmm3,[rel PD_ONEHALF] - paddd xmm5,[rel PD_ONEHALF] - psrad xmm3,SCALEBITS - psrad xmm5,SCALEBITS - - packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) - packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) - psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E - psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O - - movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF) - - pcmpeqw xmm4,xmm4 - psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} - pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE - psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO - - paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) - paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) - packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) - packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) - - paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) - paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) - packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) - packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) - - paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) - paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) - packuswb 
xmm4,xmm4 ; xmm4=B(02468ACE********) - packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) - -%if RGB_PIXELSIZE == 3 ; --------------- - - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) - punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) - - movdqa xmmG,xmmA - movdqa xmmH,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) - punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) - - psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) - psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) - - movdqa xmmC,xmmD - movdqa xmmB,xmmD - punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) - punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) - - psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) - - movdqa xmmF,xmmE - punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) - punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) - - pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) - movdqa xmmB,xmmE - punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) - punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) - punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) - - pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) - movdqa xmmB,xmmF - punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) - punpckldq xmmF,xmmH 
; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) - punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) - - punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test rdi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF - jmp short .out0 + movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + pcmpeqw xmm7, xmm7 + psrlw xmm4, BYTE_BIT + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4, xmm7 + paddw xmm5, xmm7 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2, xmm4 ; xmm2=CbE + movdqa xmm3, xmm5 ; xmm3=CbO + paddw xmm4, xmm4 ; xmm4=2*CbE + paddw xmm5, xmm5 ; xmm5=2*CbO + movdqa xmm6, xmm0 ; xmm6=CrE + movdqa xmm7, xmm1 ; xmm7=CrO + paddw xmm0, xmm0 ; xmm0=2*CrE + paddw xmm1, xmm1 ; xmm1=2*CrO + + pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4, [rel 
PW_ONE] + paddw xmm5, [rel PW_ONE] + psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0, [rel PW_ONE] + paddw xmm1, [rel PW_ONE] + psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4, xmm2 + paddw xmm5, xmm3 + paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + punpcklwd xmm2, xmm6 + punpckhwd xmm4, xmm6 + pmaddwd xmm2, [rel PW_MF0344_F0285] + pmaddwd xmm4, [rel PW_MF0344_F0285] + punpcklwd xmm3, xmm7 + punpckhwd xmm5, xmm7 + pmaddwd xmm3, [rel PW_MF0344_F0285] + pmaddwd xmm5, [rel PW_MF0344_F0285] + + paddd xmm2, [rel PD_ONEHALF] + paddd xmm4, [rel PD_ONEHALF] + psrad xmm2, SCALEBITS + psrad xmm4, SCALEBITS + paddd xmm3, [rel PD_ONEHALF] + paddd xmm5, [rel PD_ONEHALF] + psrad xmm3, SCALEBITS + psrad xmm5, SCALEBITS + + packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; 
xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + 
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 .out1: ; --(unaligned)----------------- - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF .out0: - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub rcx, byte SIZEOF_XMMWORD - jz near .nextrow + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow - add rsi, byte SIZEOF_XMMWORD ; inptr0 - add rbx, byte SIZEOF_XMMWORD ; inptr1 - add rdx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop .column_st32: - lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE - cmp rcx, byte 2*SIZEOF_XMMWORD - jb short .column_st16 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - add rdi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmF - sub rcx, byte 
2*SIZEOF_XMMWORD - jmp short .column_st15 + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 .column_st16: - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st15 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub rcx, byte SIZEOF_XMMWORD + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD .column_st15: - ; Store the lower 8 bytes of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_MMWORD - jb short .column_st7 - movq XMM_MMWORD [rdi], xmmA - add rdi, byte SIZEOF_MMWORD - sub rcx, byte SIZEOF_MMWORD - psrldq xmmA, SIZEOF_MMWORD + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD .column_st7: - ; Store the lower 4 bytes of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_DWORD - jb short .column_st3 - movd XMM_DWORD [rdi], xmmA - add rdi, byte SIZEOF_DWORD - sub rcx, byte SIZEOF_DWORD - psrldq xmmA, SIZEOF_DWORD + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD .column_st3: - ; Store the lower 2 bytes of rax to the output when it has enough - ; space. 
- movd eax, xmmA - cmp rcx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [rdi], ax - add rdi, byte SIZEOF_WORD - sub rcx, byte SIZEOF_WORD - shr rax, 16 + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 .column_st1: - ; Store the lower 1 byte of rax to the output when it has enough - ; space. - test rcx, rcx - jz short .nextrow - mov BYTE [rdi], al + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + mov BYTE [rdi], al -%else ; RGB_PIXELSIZE == 4 ; ----------- +%else ; RGB_PIXELSIZE == 4 ; ----------- %ifdef RGBX_FILLER_0XFF - pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) %else - pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) %endif - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) - punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) - punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) - - movdqa xmmC,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) - punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) - movdqa 
xmmG,xmmB - punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) - punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - movdqa xmmH,xmmC - punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test rdi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC - movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH - jmp short .out0 + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 
25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 .out1: ; --(unaligned)----------------- - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC - movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH .out0: - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub rcx, byte SIZEOF_XMMWORD - jz near .nextrow + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow - add rsi, byte SIZEOF_XMMWORD ; inptr0 - add rbx, byte SIZEOF_XMMWORD ; inptr1 - add rdx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop .column_st32: - cmp rcx, byte SIZEOF_XMMWORD/2 - jb short .column_st16 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - add rdi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmC - movdqa xmmD,xmmH - sub rcx, byte SIZEOF_XMMWORD/2 + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub rcx, byte 
SIZEOF_XMMWORD/2 .column_st16: - cmp rcx, byte SIZEOF_XMMWORD/4 - jb short .column_st15 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub rcx, byte SIZEOF_XMMWORD/4 + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD/4 .column_st15: - ; Store two pixels (8 bytes) of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_XMMWORD/8 - jb short .column_st7 - movq MMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD/8*4 - sub rcx, byte SIZEOF_XMMWORD/8 - psrldq xmmA, SIZEOF_XMMWORD/8*4 + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 .column_st7: - ; Store one pixel (4 bytes) of xmmA to the output when it has enough - ; space. - test rcx, rcx - jz short .nextrow - movd XMM_DWORD [rdi], xmmA + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. 
+ test rcx, rcx + jz short .nextrow + movd XMM_DWORD [rdi], xmmA -%endif ; RGB_PIXELSIZE ; --------------- +%endif ; RGB_PIXELSIZE ; --------------- .nextrow: - pop rcx - pop rsi - pop rbx - pop rdx - pop rdi - pop rax - - add rsi, byte SIZEOF_JSAMPROW - add rbx, byte SIZEOF_JSAMPROW - add rdx, byte SIZEOF_JSAMPROW - add rdi, byte SIZEOF_JSAMPROW ; output_buf - dec rax ; num_rows - jg near .rowloop - - sfence ; flush the write buffer + pop rcx + pop rsi + pop rbx + pop rdx + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + add rdi, byte SIZEOF_JSAMPROW ; output_buf + dec rax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer .return: - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + pop rbx + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jdcolext-sse2.asm b/simd/jdcolext-sse2.asm index 682aef3..09844bf 100644 --- a/simd/jdcolext-sse2.asm +++ b/simd/jdcolext-sse2.asm @@ -28,432 +28,432 @@ ; JSAMPARRAY output_buf, int num_rows) ; -%define out_width(b) (b)+8 ; JDIMENSION out_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define input_row(b) (b)+16 ; JDIMENSION input_row -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define num_rows(b) (b)+24 ; int num_rows +%define out_width(b) (b)+8 ; JDIMENSION out_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define input_row(b) (b)+16 ; JDIMENSION input_row +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define num_rows(b) (b)+24 ; int num_rows -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 - global EXTN(jsimd_ycc_rgb_convert_sse2) + align 16 + global EXTN(jsimd_ycc_rgb_convert_sse2) EXTN(jsimd_ycc_rgb_convert_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [out_width(eax)] ; num_cols - test ecx,ecx - jz near .return - - push ecx - - mov edi, JSAMPIMAGE [input_buf(eax)] - mov ecx, JDIMENSION [input_row(eax)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - lea esi, [esi+ecx*SIZEOF_JSAMPROW] - lea 
ebx, [ebx+ecx*SIZEOF_JSAMPROW] - lea edx, [edx+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov edi, JSAMPARRAY [output_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax, eax + jle near .return + alignx 16, 7 .rowloop: - push eax - push edi - push edx - push ebx - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr0 - mov ebx, JSAMPROW [ebx] ; inptr1 - mov edx, JSAMPROW [edx] ; inptr2 - mov edi, JSAMPROW [edi] ; outptr - movpic eax, POINTER [gotptr] ; load GOT address (eax) - alignx 16,7 + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16, 7 .columnloop: - movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) - movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) - - pcmpeqw xmm4,xmm4 - pcmpeqw xmm7,xmm7 - psrlw xmm4,BYTE_BIT - psllw xmm7,7 ; xmm7={0xFF80 0xFF80 
0xFF80 0xFF80 ..} - movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} - - pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE - psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO - pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE - psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO - - paddw xmm4,xmm7 - paddw xmm5,xmm7 - paddw xmm0,xmm7 - paddw xmm1,xmm7 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movdqa xmm2,xmm4 ; xmm2=CbE - movdqa xmm3,xmm5 ; xmm3=CbO - paddw xmm4,xmm4 ; xmm4=2*CbE - paddw xmm5,xmm5 ; xmm5=2*CbO - movdqa xmm6,xmm0 ; xmm6=CrE - movdqa xmm7,xmm1 ; xmm7=CrO - paddw xmm0,xmm0 ; xmm0=2*CrE - paddw xmm1,xmm1 ; xmm1=2*CrO - - pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) - pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) - pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) - pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) - - paddw xmm4,[GOTOFF(eax,PW_ONE)] - paddw xmm5,[GOTOFF(eax,PW_ONE)] - psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) - psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) - paddw xmm0,[GOTOFF(eax,PW_ONE)] - paddw xmm1,[GOTOFF(eax,PW_ONE)] - psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) - psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) - - paddw xmm4,xmm2 - paddw xmm5,xmm3 - paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E - paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O - paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E - paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O - - movdqa xmm4,xmm2 - movdqa xmm5,xmm3 - punpcklwd xmm2,xmm6 - punpckhwd xmm4,xmm6 - pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)] - punpcklwd xmm3,xmm7 - punpckhwd xmm5,xmm7 - pmaddwd 
xmm3,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] - - paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm4,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm2,SCALEBITS - psrad xmm4,SCALEBITS - paddd xmm3,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm3,SCALEBITS - psrad xmm5,SCALEBITS - - packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) - packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) - psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E - psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O - - movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) - - pcmpeqw xmm4,xmm4 - psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} - pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE - psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO - - paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) - paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) - packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) - packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) - - paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) - paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) - packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) - packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) - - paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) - paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) - packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) - packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) - -%if RGB_PIXELSIZE == 3 ; --------------- - - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) - punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) - 
- movdqa xmmG,xmmA - movdqa xmmH,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) - punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) - - psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) - psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) - - movdqa xmmC,xmmD - movdqa xmmB,xmmD - punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) - punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) - - psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) - - movdqa xmmF,xmmE - punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) - punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) - - pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) - movdqa xmmB,xmmE - punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) - punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) - punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) - - pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) - movdqa xmmB,xmmF - punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) - punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) - punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) - - punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF - 
jmp short .out0 + movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + pcmpeqw xmm7, xmm7 + psrlw xmm4, BYTE_BIT + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4, xmm7 + paddw xmm5, xmm7 + paddw xmm0, xmm7 + paddw xmm1, xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2, xmm4 ; xmm2=CbE + movdqa xmm3, xmm5 ; xmm3=CbO + paddw xmm4, xmm4 ; xmm4=2*CbE + paddw xmm5, xmm5 ; xmm5=2*CbO + movdqa xmm6, xmm0 ; xmm6=CrE + movdqa xmm7, xmm1 ; xmm7=CrO + paddw xmm0, xmm0 ; xmm0=2*CrE + paddw xmm1, xmm1 ; xmm1=2*CrO + + pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4, [GOTOFF(eax,PW_ONE)] + paddw xmm5, [GOTOFF(eax,PW_ONE)] + psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0, [GOTOFF(eax,PW_ONE)] + paddw xmm1, [GOTOFF(eax,PW_ONE)] + psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4, xmm2 + paddw xmm5, xmm3 + paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], 
xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4, xmm2 + movdqa xmm5, xmm3 + punpcklwd xmm2, xmm6 + punpckhwd xmm4, xmm6 + pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm3, xmm7 + punpckhwd xmm5, xmm7 + pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm2, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm4, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm2, SCALEBITS + psrad xmm4, SCALEBITS + paddd xmm3, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm5, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm3, SCALEBITS + psrad xmm5, SCALEBITS + + packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4, xmm4 + psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** 
** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 
1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 .out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF .out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .nextrow + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow - add esi, byte SIZEOF_XMMWORD ; inptr0 - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 .column_st32: - lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE - cmp ecx, byte 2*SIZEOF_XMMWORD - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmF - sub ecx, byte 2*SIZEOF_XMMWORD - jmp short .column_st15 + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 .column_st16: - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - movdqa 
xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD .column_st15: - ; Store the lower 8 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_MMWORD - sub ecx, byte SIZEOF_MMWORD - psrldq xmmA, SIZEOF_MMWORD + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD .column_st7: - ; Store the lower 4 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_DWORD - jb short .column_st3 - movd XMM_DWORD [edi], xmmA - add edi, byte SIZEOF_DWORD - sub ecx, byte SIZEOF_DWORD - psrldq xmmA, SIZEOF_DWORD + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD .column_st3: - ; Store the lower 2 bytes of eax to the output when it has enough - ; space. - movd eax, xmmA - cmp ecx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [edi], ax - add edi, byte SIZEOF_WORD - sub ecx, byte SIZEOF_WORD - shr eax, 16 + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 .column_st1: - ; Store the lower 1 byte of eax to the output when it has enough - ; space. - test ecx, ecx - jz short .nextrow - mov BYTE [edi], al + ; Store the lower 1 byte of eax to the output when it has enough + ; space. 
+ test ecx, ecx + jz short .nextrow + mov BYTE [edi], al -%else ; RGB_PIXELSIZE == 4 ; ----------- +%else ; RGB_PIXELSIZE == 4 ; ----------- %ifdef RGBX_FILLER_0XFF - pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) %else - pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) %endif - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) - punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) - punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) - - movdqa xmmC,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) - punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) - movdqa xmmG,xmmB - punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) - punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - movdqa xmmH,xmmC - punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - 
; --(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH - jmp short .out0 + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 .out1: ; --(unaligned)----------------- - movdqu XMMWORD 
[edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH .out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .nextrow + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow - add esi, byte SIZEOF_XMMWORD ; inptr0 - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 .column_st32: - cmp ecx, byte SIZEOF_XMMWORD/2 - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmC - movdqa xmmD,xmmH - sub ecx, byte SIZEOF_XMMWORD/2 + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub ecx, byte SIZEOF_XMMWORD/2 .column_st16: - cmp ecx, byte SIZEOF_XMMWORD/4 - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD/4 + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD/4 .column_st15: - ; Store two pixels (8 bytes) of xmmA to the output when it has enough - ; space. 
- cmp ecx, byte SIZEOF_XMMWORD/8 - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD/8*4 - sub ecx, byte SIZEOF_XMMWORD/8 - psrldq xmmA, SIZEOF_XMMWORD/8*4 + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 .column_st7: - ; Store one pixel (4 bytes) of xmmA to the output when it has enough - ; space. - test ecx, ecx - jz short .nextrow - movd XMM_DWORD [edi], xmmA + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + movd XMM_DWORD [edi], xmmA -%endif ; RGB_PIXELSIZE ; --------------- +%endif ; RGB_PIXELSIZE ; --------------- - alignx 16,7 + alignx 16, 7 .nextrow: - pop ecx - pop esi - pop ebx - pop edx - pop edi - pop eax - - add esi, byte SIZEOF_JSAMPROW - add ebx, byte SIZEOF_JSAMPROW - add edx, byte SIZEOF_JSAMPROW - add edi, byte SIZEOF_JSAMPROW ; output_buf - dec eax ; num_rows - jg near .rowloop - - sfence ; flush the write buffer + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jdcolor-sse2-64.asm b/simd/jdcolor-sse2-64.asm index d2bf210..855badb 100644 --- a/simd/jdcolor-sse2-64.asm +++ b/simd/jdcolor-sse2-64.asm @@ -20,21 +20,21 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_ycc_rgb_convert_sse2) + alignz 16 + global EXTN(jconst_ycc_rgb_convert_sse2) EXTN(jconst_ycc_rgb_convert_sse2): @@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 %include "jdcolext-sse2-64.asm" diff --git a/simd/jdcolor-sse2.asm b/simd/jdcolor-sse2.asm index 7ff5d05..1345df9 100644 --- a/simd/jdcolor-sse2.asm +++ b/simd/jdcolor-sse2.asm @@ -20,21 +20,21 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 
65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_ycc_rgb_convert_sse2) + alignz 16 + global EXTN(jconst_ycc_rgb_convert_sse2) EXTN(jconst_ycc_rgb_convert_sse2): @@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jdcolext-sse2.asm" diff --git a/simd/jdct.inc b/simd/jdct.inc index b976107..7ae2ca4 100644 --- a/simd/jdct.inc +++ b/simd/jdct.inc @@ -17,11 +17,11 @@ ; %define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples -%define ROW(n,b,s) ((b)+(n)*(s)) -%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE) +%define ROW(n,b,s) ((b)+(n)*(s)) +%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE) -%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD) -%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD) -%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD) +%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD) +%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD) +%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD) ; -------------------------------------------------------------------------- diff --git a/simd/jdmerge-sse2-64.asm b/simd/jdmerge-sse2-64.asm index 244bd40..dddefd8 100644 --- a/simd/jdmerge-sse2-64.asm +++ b/simd/jdmerge-sse2-64.asm @@ -20,21 +20,21 @@ ; 
-------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_merged_upsample_sse2) + alignz 16 + global EXTN(jconst_merged_upsample_sse2) EXTN(jconst_merged_upsample_sse2): @@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 %include "jdmrgext-sse2-64.asm" diff --git a/simd/jdmerge-sse2.asm b/simd/jdmerge-sse2.asm index 236de5a..0683fd0 100644 --- a/simd/jdmerge-sse2.asm +++ b/simd/jdmerge-sse2.asm @@ -20,21 +20,21 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) 
+F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_merged_upsample_sse2) + alignz 16 + global EXTN(jconst_merged_upsample_sse2) EXTN(jconst_merged_upsample_sse2): @@ -44,11 +44,11 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jdmrgext-sse2.asm" diff --git a/simd/jdmrgext-sse2-64.asm b/simd/jdmrgext-sse2-64.asm index ad74c5f..9e8eb27 100644 --- a/simd/jdmrgext-sse2-64.asm +++ b/simd/jdmrgext-sse2-64.asm @@ -34,399 +34,399 @@ ; r12 = JDIMENSION in_row_group_ctr ; r13 = JSAMPARRAY output_buf -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 3 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 3 - align 16 - global EXTN(jsimd_h2v1_merged_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_merged_upsample_sse2) EXTN(jsimd_h2v1_merged_upsample_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - push rbx - - mov ecx, r10d ; col - test rcx,rcx - jz near .return - - push rcx - - mov rdi, r11 - mov ecx, r12d - mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] - mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] - mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] - mov rdi, r13 - mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 - mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 - mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 - mov 
rdi, JSAMPROW [rdi] ; outptr - - pop rcx ; col + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov ecx, r10d ; col + test rcx, rcx + jz near .return + + push rcx + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 + mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 + mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 + mov rdi, JSAMPROW [rdi] ; outptr + + pop rcx ; col .columnloop: - movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF) - movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF) - - pxor xmm1,xmm1 ; xmm1=(all 0's) - pcmpeqw xmm3,xmm3 - psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} - - movdqa xmm4,xmm6 - punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH - punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL - movdqa xmm0,xmm7 - punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH - punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL - - paddw xmm6,xmm3 - paddw xmm4,xmm3 - paddw xmm7,xmm3 - paddw xmm0,xmm3 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movdqa xmm5,xmm6 ; xmm5=CbH - movdqa xmm2,xmm4 ; xmm2=CbL - paddw xmm6,xmm6 ; xmm6=2*CbH - paddw xmm4,xmm4 ; xmm4=2*CbL - movdqa xmm1,xmm7 ; xmm1=CrH - movdqa xmm3,xmm0 ; xmm3=CrL - paddw xmm7,xmm7 ; xmm7=2*CrH - paddw xmm0,xmm0 ; xmm0=2*CrL - - pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800)) - pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800)) - pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200)) - pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL 
* FIX(0.40200)) - - paddw xmm6,[rel PW_ONE] - paddw xmm4,[rel PW_ONE] - psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) - psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) - paddw xmm7,[rel PW_ONE] - paddw xmm0,[rel PW_ONE] - psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) - psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) - - paddw xmm6,xmm5 - paddw xmm4,xmm2 - paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H - paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L - paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H - paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L - - movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H - - movdqa xmm6,xmm5 - movdqa xmm7,xmm2 - punpcklwd xmm5,xmm1 - punpckhwd xmm6,xmm1 - pmaddwd xmm5,[rel PW_MF0344_F0285] - pmaddwd xmm6,[rel PW_MF0344_F0285] - punpcklwd xmm2,xmm3 - punpckhwd xmm7,xmm3 - pmaddwd xmm2,[rel PW_MF0344_F0285] - pmaddwd xmm7,[rel PW_MF0344_F0285] - - paddd xmm5,[rel PD_ONEHALF] - paddd xmm6,[rel PD_ONEHALF] - psrad xmm5,SCALEBITS - psrad xmm6,SCALEBITS - paddd xmm2,[rel PD_ONEHALF] - paddd xmm7,[rel PD_ONEHALF] - psrad xmm2,SCALEBITS - psrad xmm7,SCALEBITS - - packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) - packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) - psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H - psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L - - movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H - - mov al,2 ; Yctr - jmp short .Yloop_1st + movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1, xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3, xmm3 + psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + movdqa xmm4, xmm6 + punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0, xmm7 + punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6, xmm3 + paddw xmm4, xmm3 + paddw xmm7, xmm3 + paddw xmm0, 
xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm5, xmm6 ; xmm5=CbH + movdqa xmm2, xmm4 ; xmm2=CbL + paddw xmm6, xmm6 ; xmm6=2*CbH + paddw xmm4, xmm4 ; xmm4=2*CbL + movdqa xmm1, xmm7 ; xmm1=CrH + movdqa xmm3, xmm0 ; xmm3=CrL + paddw xmm7, xmm7 ; xmm7=2*CrH + paddw xmm0, xmm0 ; xmm0=2*CrL + + pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6, [rel PW_ONE] + paddw xmm4, [rel PW_ONE] + psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7, [rel PW_ONE] + paddw xmm0, [rel PW_ONE] + psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200)) + psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6, xmm5 + paddw xmm4, xmm2 + paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L + paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa xmm6, xmm5 + movdqa xmm7, xmm2 + punpcklwd xmm5, xmm1 + punpckhwd xmm6, xmm1 + pmaddwd xmm5, [rel PW_MF0344_F0285] + pmaddwd xmm6, [rel PW_MF0344_F0285] + punpcklwd xmm2, xmm3 + punpckhwd xmm7, xmm3 + pmaddwd xmm2, [rel PW_MF0344_F0285] + pmaddwd xmm7, [rel PW_MF0344_F0285] + + paddd xmm5, [rel PD_ONEHALF] + paddd xmm6, [rel PD_ONEHALF] + psrad xmm5, SCALEBITS + psrad xmm6, SCALEBITS + paddd xmm2, [rel PD_ONEHALF] + paddd xmm7, [rel PD_ONEHALF] + psrad xmm2, SCALEBITS + psrad xmm7, SCALEBITS + + packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2, xmm7 ; 
xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr + jmp short .Yloop_1st .Yloop_2nd: - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H - movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H - movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H .Yloop_1st: - movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF) - - pcmpeqw xmm6,xmm6 - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE - psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO - - movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) - movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) - movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) - - paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) - paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) - packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) - packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) - - paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) - paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) - packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) - packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) - - paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) - paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) - packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) - packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) - -%if RGB_PIXELSIZE == 3 ; --------------- - - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) - 
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) - - movdqa xmmG,xmmA - movdqa xmmH,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) - punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) - - psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) - psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) - - movdqa xmmC,xmmD - movdqa xmmB,xmmD - punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) - punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) - - psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) - - movdqa xmmF,xmmE - punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) - punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) - - pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) - movdqa xmmB,xmmE - punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) - punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) - punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) - - pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) - movdqa xmmB,xmmF - punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) - punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) - punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) - - punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test rdi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD 
[rdi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF - jmp short .out0 + movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF) + + pcmpeqw xmm6, xmm6 + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO + + movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H) + + paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa 
xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 .out1: ; --(unaligned)----------------- - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF 
.out0: - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub rcx, byte SIZEOF_XMMWORD - jz near .endcolumn + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .endcolumn - add rsi, byte SIZEOF_XMMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd + add rsi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd - add rbx, byte SIZEOF_XMMWORD ; inptr1 - add rdx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop .column_st32: - lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE - cmp rcx, byte 2*SIZEOF_XMMWORD - jb short .column_st16 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - add rdi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmF - sub rcx, byte 2*SIZEOF_XMMWORD - jmp short .column_st15 + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 .column_st16: - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st15 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub rcx, byte SIZEOF_XMMWORD + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD .column_st15: - ; Store the lower 8 bytes of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_MMWORD - jb short .column_st7 - movq XMM_MMWORD [rdi], xmmA - add rdi, byte SIZEOF_MMWORD - sub rcx, byte SIZEOF_MMWORD - psrldq xmmA, SIZEOF_MMWORD + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. 
+ cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD .column_st7: - ; Store the lower 4 bytes of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_DWORD - jb short .column_st3 - movd XMM_DWORD [rdi], xmmA - add rdi, byte SIZEOF_DWORD - sub rcx, byte SIZEOF_DWORD - psrldq xmmA, SIZEOF_DWORD + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD .column_st3: - ; Store the lower 2 bytes of rax to the output when it has enough - ; space. - movd eax, xmmA - cmp rcx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [rdi], ax - add rdi, byte SIZEOF_WORD - sub rcx, byte SIZEOF_WORD - shr rax, 16 + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. + movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 .column_st1: - ; Store the lower 1 byte of rax to the output when it has enough - ; space. - test rcx, rcx - jz short .endcolumn - mov BYTE [rdi], al + ; Store the lower 1 byte of rax to the output when it has enough + ; space. 
+ test rcx, rcx + jz short .endcolumn + mov BYTE [rdi], al -%else ; RGB_PIXELSIZE == 4 ; ----------- +%else ; RGB_PIXELSIZE == 4 ; ----------- %ifdef RGBX_FILLER_0XFF - pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) %else - pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) %endif - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) - punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) - punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) - - movdqa xmmC,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) - punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) - movdqa xmmG,xmmB - punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) - punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - movdqa xmmH,xmmC - punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test rdi, SIZEOF_XMMWORD-1 - jnz short .out1 
- ; --(aligned)------------------- - movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC - movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH - jmp short .out0 + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 .out1: ; --(unaligned)----------------- - movdqu XMMWORD 
[rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC - movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH .out0: - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub rcx, byte SIZEOF_XMMWORD - jz near .endcolumn + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .endcolumn - add rsi, byte SIZEOF_XMMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd + add rsi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd - add rbx, byte SIZEOF_XMMWORD ; inptr1 - add rdx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop .column_st32: - cmp rcx, byte SIZEOF_XMMWORD/2 - jb short .column_st16 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - add rdi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmC - movdqa xmmD,xmmH - sub rcx, byte SIZEOF_XMMWORD/2 + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub rcx, byte SIZEOF_XMMWORD/2 .column_st16: - cmp rcx, byte SIZEOF_XMMWORD/4 - jb short .column_st15 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub rcx, byte SIZEOF_XMMWORD/4 + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub rcx, byte SIZEOF_XMMWORD/4 .column_st15: - ; Store two pixels (8 bytes) of xmmA to the output when it has enough - ; space. 
- cmp rcx, byte SIZEOF_XMMWORD/8 - jb short .column_st7 - movq XMM_MMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD/8*4 - sub rcx, byte SIZEOF_XMMWORD/8 - psrldq xmmA, SIZEOF_XMMWORD/8*4 + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 .column_st7: - ; Store one pixel (4 bytes) of xmmA to the output when it has enough - ; space. - test rcx, rcx - jz short .endcolumn - movd XMM_DWORD [rdi], xmmA + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test rcx, rcx + jz short .endcolumn + movd XMM_DWORD [rdi], xmmA -%endif ; RGB_PIXELSIZE ; --------------- +%endif ; RGB_PIXELSIZE ; --------------- .endcolumn: - sfence ; flush the write buffer + sfence ; flush the write buffer .return: - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + pop rbx + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -444,94 +444,94 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): ; r12 = JDIMENSION in_row_group_ctr ; r13 = JSAMPARRAY output_buf - align 16 - global EXTN(jsimd_h2v2_merged_upsample_sse2) + align 16 + global EXTN(jsimd_h2v2_merged_upsample_sse2) EXTN(jsimd_h2v2_merged_upsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - push rbx - - mov eax, r10d - - mov rdi, r11 - mov ecx, r12d - mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] - mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] - mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] - mov rdi, r13 - lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] - - push rdx ; inptr2 - push rbx ; inptr1 - push rsi ; inptr00 - mov rbx,rsp - - push rdi - push rcx - push rax - - %ifdef WIN64 - mov r8, rcx - mov r9, rdi - mov 
rcx, rax - mov rdx, rbx - %else - mov rdx, rcx - mov rcx, rdi - mov rdi, rax - mov rsi, rbx - %endif - - call EXTN(jsimd_h2v1_merged_upsample_sse2) - - pop rax - pop rcx - pop rdi - pop rsi - pop rbx - pop rdx - - add rdi, byte SIZEOF_JSAMPROW ; outptr1 - add rsi, byte SIZEOF_JSAMPROW ; inptr01 - - push rdx ; inptr2 - push rbx ; inptr1 - push rsi ; inptr00 - mov rbx,rsp - - push rdi - push rcx - push rax - - %ifdef WIN64 - mov r8, rcx - mov r9, rdi - mov rcx, rax - mov rdx, rbx - %else - mov rdx, rcx - mov rcx, rdi - mov rdi, rax - mov rsi, rbx - %endif - - call EXTN(jsimd_h2v1_merged_upsample_sse2) - - pop rax - pop rcx - pop rdi - pop rsi - pop rbx - pop rdx - - pop rbx - uncollect_args - pop rbp - ret + push rbp + mov rax, rsp + mov rbp, rsp + collect_args + push rbx + + mov eax, r10d + + mov rdi, r11 + mov ecx, r12d + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + + push rdx ; inptr2 + push rbx ; inptr1 + push rsi ; inptr00 + mov rbx, rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_sse2) + + pop rax + pop rcx + pop rdi + pop rsi + pop rbx + pop rdx + + add rdi, byte SIZEOF_JSAMPROW ; outptr1 + add rsi, byte SIZEOF_JSAMPROW ; inptr01 + + push rdx ; inptr2 + push rbx ; inptr1 + push rsi ; inptr00 + mov rbx, rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_sse2) + + pop rax + pop rcx + pop rdi + pop rsi + pop rbx + pop rdx + + pop rbx + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless 
we do this. - align 16 + align 16 diff --git a/simd/jdmrgext-sse2.asm b/simd/jdmrgext-sse2.asm index b50f698..187ba0c 100644 --- a/simd/jdmrgext-sse2.asm +++ b/simd/jdmrgext-sse2.asm @@ -29,422 +29,422 @@ ; JSAMPARRAY output_buf); ; -%define output_width(b) (b)+8 ; JDIMENSION output_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 3 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 - global EXTN(jsimd_h2v1_merged_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_merged_upsample_sse2) EXTN(jsimd_h2v1_merged_upsample_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [output_width(eax)] ; col - test ecx,ecx - jz near .return - - push ecx - - mov edi, JSAMPIMAGE [input_buf(eax)] - mov ecx, JDIMENSION [in_row_group_ctr(eax)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - mov edi, JSAMPARRAY [output_buf(eax)] - mov esi, JSAMPROW 
[esi+ecx*SIZEOF_JSAMPROW] ; inptr0 - mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 - mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 - mov edi, JSAMPROW [edi] ; outptr - - pop ecx ; col - - alignx 16,7 + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx, ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16, 7 .columnloop: - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) - movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) - - pxor xmm1,xmm1 ; xmm1=(all 0's) - pcmpeqw xmm3,xmm3 - psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} - - movdqa xmm4,xmm6 - punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH - punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL - movdqa xmm0,xmm7 - punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH - punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL - - paddw xmm6,xmm3 - paddw xmm4,xmm3 - paddw xmm7,xmm3 - paddw xmm0,xmm3 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + 
Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movdqa xmm5,xmm6 ; xmm5=CbH - movdqa xmm2,xmm4 ; xmm2=CbL - paddw xmm6,xmm6 ; xmm6=2*CbH - paddw xmm4,xmm4 ; xmm4=2*CbL - movdqa xmm1,xmm7 ; xmm1=CrH - movdqa xmm3,xmm0 ; xmm3=CrL - paddw xmm7,xmm7 ; xmm7=2*CrH - paddw xmm0,xmm0 ; xmm0=2*CrL - - pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) - pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) - pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) - pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) - - paddw xmm6,[GOTOFF(eax,PW_ONE)] - paddw xmm4,[GOTOFF(eax,PW_ONE)] - psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) - psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) - paddw xmm7,[GOTOFF(eax,PW_ONE)] - paddw xmm0,[GOTOFF(eax,PW_ONE)] - psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) - psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) - - paddw xmm6,xmm5 - paddw xmm4,xmm2 - paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H - paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L - paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H - paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L - - movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H - - movdqa xmm6,xmm5 - movdqa xmm7,xmm2 - punpcklwd xmm5,xmm1 - punpckhwd xmm6,xmm1 - pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)] - punpcklwd xmm2,xmm3 - punpckhwd xmm7,xmm3 - pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)] - - paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm6,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm5,SCALEBITS - psrad xmm6,SCALEBITS - paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm7,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm2,SCALEBITS - psrad xmm7,SCALEBITS - - packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) - packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) - psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H - 
psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L - - movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H - - mov al,2 ; Yctr - jmp short .Yloop_1st - alignx 16,7 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1, xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3, xmm3 + psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + movdqa xmm4, xmm6 + punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0, xmm7 + punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6, xmm3 + paddw xmm4, xmm3 + paddw xmm7, xmm3 + paddw xmm0, xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm5, xmm6 ; xmm5=CbH + movdqa xmm2, xmm4 ; xmm2=CbL + paddw xmm6, xmm6 ; xmm6=2*CbH + paddw xmm4, xmm4 ; xmm4=2*CbL + movdqa xmm1, xmm7 ; xmm1=CrH + movdqa xmm3, xmm0 ; xmm3=CrL + paddw xmm7, xmm7 ; xmm7=2*CrH + paddw xmm0, xmm0 ; xmm0=2*CrL + + pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6, [GOTOFF(eax,PW_ONE)] + paddw xmm4, [GOTOFF(eax,PW_ONE)] + psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7, [GOTOFF(eax,PW_ONE)] + paddw xmm0, [GOTOFF(eax,PW_ONE)] + psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200)) + psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6, xmm5 + paddw xmm4, xmm2 + paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4, xmm2 ; xmm4=(CbL * 
FIX(1.77200))=(B-Y)L + paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa xmm6, xmm5 + movdqa xmm7, xmm2 + punpcklwd xmm5, xmm1 + punpckhwd xmm6, xmm1 + pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm2, xmm3 + punpckhwd xmm7, xmm3 + pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm5, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm6, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm5, SCALEBITS + psrad xmm6, SCALEBITS + paddd xmm2, [GOTOFF(eax,PD_ONEHALF)] + paddd xmm7, [GOTOFF(eax,PD_ONEHALF)] + psrad xmm2, SCALEBITS + psrad xmm7, SCALEBITS + + packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al, 2 ; Yctr + jmp short .Yloop_1st + alignx 16, 7 .Yloop_2nd: - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H - movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H - movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H - alignx 16,7 + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + alignx 16, 7 .Yloop_1st: - movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) - - pcmpeqw xmm6,xmm6 - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE - psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO - - movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) - movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) - movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) - - paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) - paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) - packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) - packuswb xmm1,xmm1 ; 
xmm1=R(13579BDF********) - - paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) - paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) - packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) - packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) - - paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) - paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) - packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) - packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) - -%if RGB_PIXELSIZE == 3 ; --------------- - - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) - punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) - - movdqa xmmG,xmmA - movdqa xmmH,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) - punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) - - psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) - psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) - - movdqa xmmC,xmmD - movdqa xmmB,xmmD - punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) - punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) - - psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) - - movdqa xmmF,xmmE - punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) - punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) - - pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) - movdqa xmmB,xmmE - punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) - punpckldq 
xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) - punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) - - pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) - movdqa xmmB,xmmF - punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) - punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) - punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) - - punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF - jmp short .out0 + movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) + + pcmpeqw xmm6, xmm6 + psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO + + movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H) + + paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0, xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1, xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2, xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3, xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4, xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5, xmm5 ; xmm5=B(13579BDF********) + +%if 
RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG, xmmA + movdqa xmmH, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC, xmmD + movdqa xmmB, xmmD + punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF, xmmE + punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB, xmmE + punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB, xmmF + punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + 
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 .out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF .out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .endcolumn + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn - add esi, byte SIZEOF_XMMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 .column_st32: - lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE - cmp ecx, byte 2*SIZEOF_XMMWORD - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmF - sub ecx, byte 2*SIZEOF_XMMWORD - jmp short .column_st15 + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu 
XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 .column_st16: - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD .column_st15: - ; Store the lower 8 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_MMWORD - sub ecx, byte SIZEOF_MMWORD - psrldq xmmA, SIZEOF_MMWORD + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD .column_st7: - ; Store the lower 4 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_DWORD - jb short .column_st3 - movd XMM_DWORD [edi], xmmA - add edi, byte SIZEOF_DWORD - sub ecx, byte SIZEOF_DWORD - psrldq xmmA, SIZEOF_DWORD + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD .column_st3: - ; Store the lower 2 bytes of eax to the output when it has enough - ; space. - movd eax, xmmA - cmp ecx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [edi], ax - add edi, byte SIZEOF_WORD - sub ecx, byte SIZEOF_WORD - shr eax, 16 + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. 
+ movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 .column_st1: - ; Store the lower 1 byte of eax to the output when it has enough - ; space. - test ecx, ecx - jz short .endcolumn - mov BYTE [edi], al + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + mov BYTE [edi], al -%else ; RGB_PIXELSIZE == 4 ; ----------- +%else ; RGB_PIXELSIZE == 4 ; ----------- %ifdef RGBX_FILLER_0XFF - pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********) %else - pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********) %endif - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) - punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) - punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) - - movdqa xmmC,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) - punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) - movdqa xmmG,xmmB - punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) - punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 
02 12 22 32 03 13 23 33) - punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - movdqa xmmH,xmmC - punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH - jmp short .out0 + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC, xmmA + punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG, xmmB + punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD, xmmA + punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH, xmmC + punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD 
+ jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 .out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH .out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .endcolumn + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn - add esi, byte SIZEOF_XMMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16, 7 .column_st32: - cmp ecx, byte SIZEOF_XMMWORD/2 - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmC - movdqa xmmD,xmmH - sub ecx, byte SIZEOF_XMMWORD/2 + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmC + movdqa xmmD, xmmH + sub ecx, byte SIZEOF_XMMWORD/2 .column_st16: - cmp ecx, byte SIZEOF_XMMWORD/4 - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, 
byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD/4 + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA, xmmD + sub ecx, byte SIZEOF_XMMWORD/4 .column_st15: - ; Store two pixels (8 bytes) of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_XMMWORD/8 - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD/8*4 - sub ecx, byte SIZEOF_XMMWORD/8 - psrldq xmmA, SIZEOF_XMMWORD/8*4 + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 .column_st7: - ; Store one pixel (4 bytes) of xmmA to the output when it has enough - ; space. - test ecx, ecx - jz short .endcolumn - movd XMM_DWORD [edi], xmmA + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. 
+ test ecx, ecx + jz short .endcolumn + movd XMM_DWORD [edi], xmmA -%endif ; RGB_PIXELSIZE ; --------------- +%endif ; RGB_PIXELSIZE ; --------------- .endcolumn: - sfence ; flush the write buffer + sfence ; flush the write buffer .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -457,62 +457,62 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): ; JSAMPARRAY output_buf); ; -%define output_width(b) (b)+8 ; JDIMENSION output_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf - align 16 - global EXTN(jsimd_h2v2_merged_upsample_sse2) + align 16 + global EXTN(jsimd_h2v2_merged_upsample_sse2) EXTN(jsimd_h2v2_merged_upsample_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov eax, POINTER [output_width(ebp)] - - mov edi, JSAMPIMAGE [input_buf(ebp)] - mov ecx, JDIMENSION [in_row_group_ctr(ebp)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - mov edi, JSAMPARRAY [output_buf(ebp)] - lea esi, [esi+ecx*SIZEOF_JSAMPROW] - - push edx ; inptr2 - push ebx ; inptr1 - push esi ; inptr00 - mov ebx,esp - - push edi ; output_buf (outptr0) - push ecx ; 
in_row_group_ctr - push ebx ; input_buf - push eax ; output_width - - call near EXTN(jsimd_h2v1_merged_upsample_sse2) - - add esi, byte SIZEOF_JSAMPROW ; inptr01 - add edi, byte SIZEOF_JSAMPROW ; outptr1 - mov POINTER [ebx+0*SIZEOF_POINTER], esi - mov POINTER [ebx-1*SIZEOF_POINTER], edi - - call near EXTN(jsimd_h2v1_merged_upsample_sse2) - - add esp, byte 7*SIZEOF_DWORD - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov eax, POINTER [output_width(ebp)] + + mov edi, JSAMPIMAGE [input_buf(ebp)] + mov ecx, JDIMENSION [in_row_group_ctr(ebp)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(ebp)] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + + push edx ; inptr2 + push ebx ; inptr1 + push esi ; inptr00 + mov ebx, esp + + push edi ; output_buf (outptr0) + push ecx ; in_row_group_ctr + push ebx ; input_buf + push eax ; output_width + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esi, byte SIZEOF_JSAMPROW ; inptr01 + add edi, byte SIZEOF_JSAMPROW ; outptr1 + mov POINTER [ebx+0*SIZEOF_POINTER], esi + mov POINTER [ebx-1*SIZEOF_POINTER], edi + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esp, byte 7*SIZEOF_DWORD + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jdsample-sse2-64.asm b/simd/jdsample-sse2-64.asm index 1faaed6..9a99050 100644 --- a/simd/jdsample-sse2-64.asm +++ b/simd/jdsample-sse2-64.asm @@ -19,24 +19,24 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fancy_upsample_sse2) + alignz 16 + global EXTN(jconst_fancy_upsample_sse2) EXTN(jconst_fancy_upsample_sse2): -PW_ONE times 8 dw 1 -PW_TWO times 8 dw 2 -PW_THREE times 8 dw 3 -PW_SEVEN times 8 dw 7 -PW_EIGHT times 8 dw 8 +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. ; @@ -57,127 +57,127 @@ PW_EIGHT times 8 dw 8 ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr - align 16 - global EXTN(jsimd_h2v1_fancy_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) EXTN(jsimd_h2v1_fancy_upsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - - mov eax, r11d ; colctr - test rax,rax - jz near .return - - mov rcx, r10 ; rowctr - test rcx,rcx - jz near .return - - mov rsi, r12 ; input_data - mov rdi, r13 - mov rdi, JSAMPARRAY [rdi] ; output_data + push rbp + mov rax, rsp + mov rbp, rsp + collect_args + + mov eax, r11d ; colctr + test rax, rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data .rowloop: - push rax ; colctr - push rdi - push rsi + push rax ; colctr + push rdi + push rsi - mov rsi, JSAMPROW [rsi] ; inptr - mov rdi, JSAMPROW [rdi] ; outptr + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr - test rax, SIZEOF_XMMWORD-1 - jz 
short .skip - mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + test rax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample .skip: - pxor xmm0,xmm0 ; xmm0=(all 0's) - pcmpeqb xmm7,xmm7 - psrldq xmm7,(SIZEOF_XMMWORD-1) - pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm0, xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] - add rax, byte SIZEOF_XMMWORD-1 - and rax, byte -SIZEOF_XMMWORD - cmp rax, byte SIZEOF_XMMWORD - ja short .columnloop + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop .columnloop_last: - pcmpeqb xmm6,xmm6 - pslldq xmm6,(SIZEOF_XMMWORD-1) - pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] - jmp short .upsample + pcmpeqb xmm6, xmm6 + pslldq xmm6, (SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] + jmp short .upsample .columnloop: - movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD] - pslldq xmm6,(SIZEOF_XMMWORD-1) + movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD] + pslldq xmm6, (SIZEOF_XMMWORD-1) .upsample: - movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqa xmm2,xmm1 - movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) - pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) - psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) - - por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) - por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) - - movdqa xmm7,xmm1 - psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... 
-- -- --) - - movdqa xmm4,xmm1 - punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm2 - punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) - punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) - movdqa xmm6,xmm3 - punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) - punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) - - pmullw xmm1,[rel PW_THREE] - pmullw xmm4,[rel PW_THREE] - paddw xmm2,[rel PW_ONE] - paddw xmm5,[rel PW_ONE] - paddw xmm3,[rel PW_TWO] - paddw xmm6,[rel PW_TWO] - - paddw xmm2,xmm1 - paddw xmm5,xmm4 - psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) - psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) - paddw xmm3,xmm1 - paddw xmm6,xmm4 - psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) - psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) - - psllw xmm3,BYTE_BIT - psllw xmm6,BYTE_BIT - por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) - por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) - - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5 - - sub rax, byte SIZEOF_XMMWORD - add rsi, byte 1*SIZEOF_XMMWORD ; inptr - add rdi, byte 2*SIZEOF_XMMWORD ; outptr - cmp rax, byte SIZEOF_XMMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop rsi - pop rdi - pop rax - - add rsi, byte SIZEOF_JSAMPROW ; input_data - add rdi, byte SIZEOF_JSAMPROW ; output_data - dec rcx ; rowctr - jg near .rowloop + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7, xmm1 + psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... 
-- -- --) + + movdqa xmm4, xmm1 + punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm2 + punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6, xmm3 + punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + paddw xmm2, [rel PW_ONE] + paddw xmm5, [rel PW_ONE] + paddw xmm3, [rel PW_TWO] + paddw xmm6, [rel PW_TWO] + + paddw xmm2, xmm1 + paddw xmm5, xmm4 + psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3, xmm1 + paddw xmm6, xmm4 + psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3, BYTE_BIT + psllw xmm6, BYTE_BIT + por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5 + + sub rax, byte SIZEOF_XMMWORD + add rsi, byte 1*SIZEOF_XMMWORD ; inptr + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg near .rowloop .return: - uncollect_args - pop rbp - ret + uncollect_args + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -196,288 +196,288 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 4 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 4 - align 16 - global EXTN(jsimd_h2v2_fancy_upsample_sse2) + align 16 + global 
EXTN(jsimd_h2v2_fancy_upsample_sse2) EXTN(jsimd_h2v2_fancy_upsample_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - push rbx - - mov eax, r11d ; colctr - test rax,rax - jz near .return - - mov rcx, r10 ; rowctr - test rcx,rcx - jz near .return - - mov rsi, r12 ; input_data - mov rdi, r13 - mov rdi, JSAMPARRAY [rdi] ; output_data + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov eax, r11d ; colctr + test rax, rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data .rowloop: - push rax ; colctr - push rcx - push rdi - push rsi - - mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above) - mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 - mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below) - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 - mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 - - test rax, SIZEOF_XMMWORD-1 - jz short .skip - push rdx - mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample - pop rdx + push rax ; colctr + push rcx + push rdi + push rsi + + mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + + test rax, SIZEOF_XMMWORD-1 + jz 
short .skip + push rdx + mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop rdx .skip: - ; -- process the first column block - - movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] - movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] - movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] - - pxor xmm3,xmm3 ; xmm3=(all 0's) - movdqa xmm4,xmm0 - punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm1 - punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm2 - punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) - - pmullw xmm0,[rel PW_THREE] - pmullw xmm4,[rel PW_THREE] - - pcmpeqb xmm7,xmm7 - psrldq xmm7,(SIZEOF_XMMWORD-2) - - paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) - paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) - paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) - paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) - - movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save - movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6 - - pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) - pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) - - movdqa XMMWORD [wk(0)], xmm1 - movdqa XMMWORD [wk(1)], xmm2 - - add rax, byte SIZEOF_XMMWORD-1 - and rax, byte -SIZEOF_XMMWORD - cmp rax, byte SIZEOF_XMMWORD - ja short .columnloop + ; -- process the first column block + + movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD 
[rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-2) + + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop .columnloop_last: - ; -- process the last column block + ; -- process the last column block - pcmpeqb xmm1,xmm1 - pslldq xmm1,(SIZEOF_XMMWORD-2) - movdqa xmm2,xmm1 + pcmpeqb xmm1, xmm1 + pslldq xmm1, (SIZEOF_XMMWORD-2) + movdqa xmm2, xmm1 - pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] - pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] + pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] - movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) - movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- 
-- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) - jmp near .upsample + jmp near .upsample .columnloop: - ; -- process the next column block - - movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] - movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] - movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] - - pxor xmm3,xmm3 ; xmm3=(all 0's) - movdqa xmm4,xmm0 - punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm1 - punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm2 - punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) - - pmullw xmm0,[rel PW_THREE] - pmullw xmm4,[rel PW_THREE] - - paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) - paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) - paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) - paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) - - movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save - movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data - movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6 - - pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) - pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) - - movdqa XMMWORD [wk(2)], xmm1 - movdqa XMMWORD [wk(3)], xmm2 + ; -- process the next column block + + movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + 
punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 .upsample: - ; -- process the upper row - - movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD] - movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD] - - movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) - movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) - psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) - pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) - movdqa xmm5,xmm7 - movdqa xmm6,xmm3 - psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) - pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) - - por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) - por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) - - movdqa xmm1,xmm7 - movdqa xmm2,xmm3 - pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) - psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) - movdqa xmm4,xmm3 - psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) - - por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) - por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) - - movdqa XMMWORD [wk(0)], xmm4 - - pmullw xmm7,[rel PW_THREE] - pmullw xmm3,[rel PW_THREE] - paddw xmm1,[rel PW_EIGHT] - paddw xmm5,[rel PW_EIGHT] - paddw xmm0,[rel 
PW_SEVEN] - paddw xmm2,[rel PW_SEVEN] - - paddw xmm1,xmm7 - paddw xmm5,xmm3 - psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) - psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) - paddw xmm0,xmm7 - paddw xmm2,xmm3 - psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) - psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) - - psllw xmm0,BYTE_BIT - psllw xmm2,BYTE_BIT - por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) - por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) - - movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 - - ; -- process the lower row - - movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD] - movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD] - - movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) - movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) - psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) - pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) - movdqa xmm0,xmm6 - movdqa xmm2,xmm4 - psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) - pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) - - por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) - por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) - - movdqa xmm1,xmm6 - movdqa xmm5,xmm4 - pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) - psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) - movdqa xmm3,xmm4 - psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) - - por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) - por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) - - movdqa XMMWORD [wk(1)], xmm3 - - pmullw xmm6,[rel PW_THREE] - pmullw xmm4,[rel PW_THREE] - paddw xmm1,[rel PW_EIGHT] - paddw xmm0,[rel PW_EIGHT] - paddw xmm7,[rel PW_SEVEN] - paddw xmm5,[rel PW_SEVEN] - - paddw xmm1,xmm6 - paddw xmm0,xmm4 - psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) - psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) - paddw xmm7,xmm6 - paddw xmm5,xmm4 - psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) - psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) - - psllw 
xmm7,BYTE_BIT - psllw xmm5,BYTE_BIT - por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) - por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) - - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0 - - sub rax, byte SIZEOF_XMMWORD - add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above) - add rbx, byte 1*SIZEOF_XMMWORD ; inptr0 - add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below) - add rdx, byte 2*SIZEOF_XMMWORD ; outptr0 - add rdi, byte 2*SIZEOF_XMMWORD ; outptr1 - cmp rax, byte SIZEOF_XMMWORD - ja near .columnloop - test rax,rax - jnz near .columnloop_last - - pop rsi - pop rdi - pop rcx - pop rax - - add rsi, byte 1*SIZEOF_JSAMPROW ; input_data - add rdi, byte 2*SIZEOF_JSAMPROW ; output_data - sub rcx, byte 2 ; rowctr - jg near .rowloop + ; -- process the upper row + + movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD] + + movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5, xmm7 + movdqa xmm6, xmm3 + psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm7 + movdqa xmm2, xmm3 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4, xmm3 + psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7, [rel PW_THREE] + pmullw xmm3, [rel PW_THREE] + paddw xmm1, [rel PW_EIGHT] + paddw xmm5, [rel PW_EIGHT] + paddw xmm0, [rel PW_SEVEN] + paddw xmm2, [rel PW_SEVEN] + + paddw xmm1, xmm7 + paddw xmm5, xmm3 + psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 
10 12 14) + psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0, xmm7 + paddw xmm2, xmm3 + psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0, BYTE_BIT + psllw xmm2, BYTE_BIT + por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD] + + movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0, xmm6 + movdqa xmm2, xmm4 + psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3, xmm4 + psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6, [rel PW_THREE] + pmullw xmm4, [rel PW_THREE] + paddw xmm1, [rel PW_EIGHT] + paddw xmm0, [rel PW_EIGHT] + paddw xmm7, [rel PW_SEVEN] + paddw xmm5, [rel PW_SEVEN] + + paddw xmm1, xmm6 + paddw xmm0, xmm4 + psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7, xmm6 + paddw xmm5, xmm4 + psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7, BYTE_BIT + psllw xmm5, BYTE_BIT + por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 
13 14 15) + por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0 + + sub rax, byte SIZEOF_XMMWORD + add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add rbx, byte 1*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add rdx, byte 2*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test rax, rax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rcx + pop rax + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop .return: - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + pop rbx + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -496,77 +496,77 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr - align 16 - global EXTN(jsimd_h2v1_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_upsample_sse2) EXTN(jsimd_h2v1_upsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - - mov edx, r11d - add rdx, byte (2*SIZEOF_XMMWORD)-1 - and rdx, byte -(2*SIZEOF_XMMWORD) - jz near .return - - mov rcx, r10 ; rowctr - test rcx,rcx - jz short .return - - mov rsi, r12 ; input_data - mov rdi, r13 - mov rdi, JSAMPARRAY [rdi] ; output_data + push rbp + mov rax, rsp + mov rbp, rsp + collect_args + + mov edx, r11d + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz short .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data .rowloop: - push rdi - push rsi + push rdi + push rsi - mov rsi, JSAMPROW [rsi] ; inptr - mov rdi, JSAMPROW 
[rdi] ; outptr - mov rax,rdx ; colctr + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + mov rax, rdx ; colctr .columnloop: - movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqa xmm1,xmm0 - punpcklbw xmm0,xmm0 - punpckhbw xmm1,xmm1 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 - sub rax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqa xmm3,xmm2 - punpcklbw xmm2,xmm2 - punpckhbw xmm3,xmm3 + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 - movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 - sub rax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - add rsi, byte 2*SIZEOF_XMMWORD ; inptr - add rdi, byte 4*SIZEOF_XMMWORD ; outptr - jmp short .columnloop + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop .nextrow: - pop rsi - pop rdi + pop rsi + pop rdi - add rsi, byte SIZEOF_JSAMPROW ; input_data - add rdi, byte SIZEOF_JSAMPROW ; output_data - dec rcx ; rowctr - jg short .rowloop + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg short .rowloop .return: - uncollect_args - pop rbp - ret + uncollect_args + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -574,7 +574,7 @@ EXTN(jsimd_h2v1_upsample_sse2): ; It's still a box filter. 
; ; GLOBAL(void) -; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor, +; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor, ; JDIMENSION output_width, ; JSAMPARRAY input_data, ; JSAMPARRAY *output_data_ptr); @@ -585,86 +585,86 @@ EXTN(jsimd_h2v1_upsample_sse2): ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr - align 16 - global EXTN(jsimd_h2v2_upsample_sse2) + align 16 + global EXTN(jsimd_h2v2_upsample_sse2) EXTN(jsimd_h2v2_upsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - push rbx - - mov edx, r11d - add rdx, byte (2*SIZEOF_XMMWORD)-1 - and rdx, byte -(2*SIZEOF_XMMWORD) - jz near .return - - mov rcx, r10 ; rowctr - test rcx,rcx - jz near .return - - mov rsi, r12 ; input_data - mov rdi, r13 - mov rdi, JSAMPARRAY [rdi] ; output_data + push rbp + mov rax, rsp + mov rbp, rsp + collect_args + push rbx + + mov edx, r11d + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx, rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data .rowloop: - push rdi - push rsi + push rdi + push rsi - mov rsi, JSAMPROW [rsi] ; inptr - mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 - mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 - mov rax,rdx ; colctr + mov rsi, JSAMPROW [rsi] ; inptr + mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + mov rax, rdx ; colctr .columnloop: - movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqa xmm1,xmm0 - punpcklbw xmm0,xmm0 - punpckhbw xmm1,xmm1 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 - movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 + 
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 - sub rax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqa xmm3,xmm2 - punpcklbw xmm2,xmm2 - punpckhbw xmm3,xmm3 + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 - movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3 - movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 - sub rax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - add rsi, byte 2*SIZEOF_XMMWORD ; inptr - add rbx, byte 4*SIZEOF_XMMWORD ; outptr0 - add rdi, byte 4*SIZEOF_XMMWORD ; outptr1 - jmp short .columnloop + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rbx, byte 4*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop .nextrow: - pop rsi - pop rdi + pop rsi + pop rdi - add rsi, byte 1*SIZEOF_JSAMPROW ; input_data - add rdi, byte 2*SIZEOF_JSAMPROW ; output_data - sub rcx, byte 2 ; rowctr - jg near .rowloop + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop .return: - pop rbx - uncollect_args - pop rbp - ret + pop rbx + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jdsample-sse2.asm b/simd/jdsample-sse2.asm index 1d0059e..99a8c7c 100644 --- a/simd/jdsample-sse2.asm +++ b/simd/jdsample-sse2.asm @@ -18,24 +18,24 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fancy_upsample_sse2) + alignz 16 + global EXTN(jconst_fancy_upsample_sse2) EXTN(jconst_fancy_upsample_sse2): -PW_ONE times 8 dw 1 -PW_TWO times 8 dw 2 -PW_THREE times 8 dw 3 -PW_SEVEN times 8 dw 7 -PW_EIGHT times 8 dw 8 +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. ; @@ -51,144 +51,144 @@ PW_EIGHT times 8 dw 8 ; JSAMPARRAY *output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr - align 16 - global EXTN(jsimd_h2v1_fancy_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) EXTN(jsimd_h2v1_fancy_upsample_sse2): - push ebp - mov ebp,esp - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr - test eax,eax - jz near .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY 
[input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov ebp, esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 .rowloop: - push eax ; colctr - push edi - push esi + push eax ; colctr + push edi + push esi - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr - test eax, SIZEOF_XMMWORD-1 - jz short .skip - mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + test eax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample .skip: - pxor xmm0,xmm0 ; xmm0=(all 0's) - pcmpeqb xmm7,xmm7 - psrldq xmm7,(SIZEOF_XMMWORD-1) - pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm0, xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] - add eax, byte SIZEOF_XMMWORD-1 - and eax, byte -SIZEOF_XMMWORD - cmp eax, byte SIZEOF_XMMWORD - ja short .columnloop - alignx 16,7 + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16, 7 .columnloop_last: - pcmpeqb xmm6,xmm6 - pslldq xmm6,(SIZEOF_XMMWORD-1) - pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] - jmp short .upsample - alignx 16,7 + pcmpeqb xmm6, xmm6 + pslldq xmm6, (SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] + jmp short .upsample + alignx 16, 7 
.columnloop: - movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] - pslldq xmm6,(SIZEOF_XMMWORD-1) + movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] + pslldq xmm6, (SIZEOF_XMMWORD-1) .upsample: - movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm2,xmm1 - movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) - pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) - psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) - - por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) - por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) - - movdqa xmm7,xmm1 - psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) - - movdqa xmm4,xmm1 - punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm2 - punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) - punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) - movdqa xmm6,xmm3 - punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) - punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) - - pmullw xmm1,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - paddw xmm2,[GOTOFF(ebx,PW_ONE)] - paddw xmm5,[GOTOFF(ebx,PW_ONE)] - paddw xmm3,[GOTOFF(ebx,PW_TWO)] - paddw xmm6,[GOTOFF(ebx,PW_TWO)] - - paddw xmm2,xmm1 - paddw xmm5,xmm4 - psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) - psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) - paddw xmm3,xmm1 - paddw xmm6,xmm4 - psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) - psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) - - psllw xmm3,BYTE_BIT - psllw xmm6,BYTE_BIT - por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) - por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 - - sub eax, byte SIZEOF_XMMWORD - add esi, byte 1*SIZEOF_XMMWORD ; inptr - add edi, byte 2*SIZEOF_XMMWORD ; outptr - cmp eax, byte SIZEOF_XMMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop esi - pop edi - pop eax - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec ecx ; rowctr - jg near .rowloop + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2, xmm1 + movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7, xmm1 + psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) + + movdqa xmm4, xmm1 + punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm2 + punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6, xmm3 + punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] + paddw xmm2, [GOTOFF(ebx,PW_ONE)] + paddw xmm5, [GOTOFF(ebx,PW_ONE)] + paddw xmm3, [GOTOFF(ebx,PW_TWO)] + paddw xmm6, [GOTOFF(ebx,PW_TWO)] + + paddw xmm2, xmm1 + paddw xmm5, xmm4 + psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3, xmm1 + paddw xmm6, xmm4 + psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3, BYTE_BIT + psllw xmm6, BYTE_BIT + por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 + + sub eax, byte SIZEOF_XMMWORD + add esi, byte 1*SIZEOF_XMMWORD ; inptr + add edi, byte 2*SIZEOF_XMMWORD ; outptr + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -202,322 +202,322 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): ; JSAMPARRAY *output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 4 -%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr - align 16 - global EXTN(jsimd_h2v2_fancy_upsample_sse2) + align 16 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) EXTN(jsimd_h2v2_fancy_upsample_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov 
ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov edx,eax ; edx = original ebp - mov eax, JDIMENSION [downsamp_width(edx)] ; colctr - test eax,eax - jz near .return - - mov ecx, INT [max_v_samp(edx)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(edx)] ; input_data - mov edi, POINTER [output_data_ptr(edx)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx, eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax, eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 .rowloop: - push eax ; colctr - push ecx - push edi - push esi - - mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 - - test eax, SIZEOF_XMMWORD-1 - jz short .skip - push edx - mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl - 
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample - pop edx + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx .skip: - ; -- process the first column block + ; -- process the first column block - movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] - movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] - movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address - pxor xmm3,xmm3 ; xmm3=(all 0's) - movdqa xmm4,xmm0 - punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm1 - punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm2 - punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + pxor xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; 
xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) - pmullw xmm0,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + pmullw xmm0, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] - pcmpeqb xmm7,xmm7 - psrldq xmm7,(SIZEOF_XMMWORD-2) + pcmpeqb xmm7, xmm7 + psrldq xmm7, (SIZEOF_XMMWORD-2) - paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) - paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) - paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) - paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) - movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save - movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 - pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) - pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) - movdqa XMMWORD [wk(0)], xmm1 - movdqa XMMWORD [wk(1)], xmm2 + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 - poppic ebx + poppic ebx - add eax, byte SIZEOF_XMMWORD-1 - and eax, byte -SIZEOF_XMMWORD - cmp eax, byte SIZEOF_XMMWORD - ja short .columnloop - alignx 16,7 + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte 
-SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16, 7 .columnloop_last: - ; -- process the last column block + ; -- process the last column block - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address - pcmpeqb xmm1,xmm1 - pslldq xmm1,(SIZEOF_XMMWORD-2) - movdqa xmm2,xmm1 + pcmpeqb xmm1, xmm1 + pslldq xmm1, (SIZEOF_XMMWORD-2) + movdqa xmm2, xmm1 - pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] - pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] + pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] - movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) - movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) - jmp near .upsample - alignx 16,7 + jmp near .upsample + alignx 16, 7 .columnloop: - ; -- process the next column block + ; -- process the next column block - movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] - movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] - movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address - pxor xmm3,xmm3 ; xmm3=(all 0's) - movdqa xmm4,xmm0 - punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm1 - punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm2 - punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + pxor 
xmm3, xmm3 ; xmm3=(all 0's) + movdqa xmm4, xmm0 + punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5, xmm1 + punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6, xmm2 + punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) - pmullw xmm0,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + pmullw xmm0, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] - paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) - paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) - paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) - paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) - movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save - movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data - movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 + movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 - pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) - pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) - movdqa XMMWORD [wk(2)], xmm1 - movdqa XMMWORD [wk(3)], xmm2 + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 .upsample: - ; -- process the upper row - - movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] - movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] - - 
movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) - movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) - psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) - pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) - movdqa xmm5,xmm7 - movdqa xmm6,xmm3 - psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) - pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) - - por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) - por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) - - movdqa xmm1,xmm7 - movdqa xmm2,xmm3 - pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) - psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) - movdqa xmm4,xmm3 - psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) - - por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) - por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) - - movdqa XMMWORD [wk(0)], xmm4 - - pmullw xmm7,[GOTOFF(ebx,PW_THREE)] - pmullw xmm3,[GOTOFF(ebx,PW_THREE)] - paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm5,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] - paddw xmm2,[GOTOFF(ebx,PW_SEVEN)] - - paddw xmm1,xmm7 - paddw xmm5,xmm3 - psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) - psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) - paddw xmm0,xmm7 - paddw xmm2,xmm3 - psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) - psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) - - psllw xmm0,BYTE_BIT - psllw xmm2,BYTE_BIT - por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) - por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 - - ; -- process the lower row - - movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] - movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] - - movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) - movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) - psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) - pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) - movdqa xmm0,xmm6 - movdqa xmm2,xmm4 - psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) - pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) - - por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) - por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) - - movdqa xmm1,xmm6 - movdqa xmm5,xmm4 - pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) - psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) - movdqa xmm3,xmm4 - psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) - - por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) - por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) - - movdqa XMMWORD [wk(1)], xmm3 - - pmullw xmm6,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm0,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] - paddw xmm5,[GOTOFF(ebx,PW_SEVEN)] - - paddw xmm1,xmm6 - paddw xmm0,xmm4 - psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) - psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) - paddw xmm7,xmm6 - paddw xmm5,xmm4 - psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) - psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) - - psllw xmm7,BYTE_BIT - psllw xmm5,BYTE_BIT - por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) - por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 - - poppic ebx - - sub eax, byte SIZEOF_XMMWORD - add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) - add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 - add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) - add edx, byte 2*SIZEOF_XMMWORD ; outptr0 - add edi, byte 2*SIZEOF_XMMWORD ; outptr1 - cmp eax, byte SIZEOF_XMMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop esi - pop edi - pop ecx - pop eax - - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; output_data - sub ecx, byte 2 ; rowctr - jg near .rowloop + ; -- process the upper row + + movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] + + movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5, xmm7 + movdqa xmm6, xmm3 + psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm7 + movdqa xmm2, xmm3 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4, xmm3 + psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7, [GOTOFF(ebx,PW_THREE)] + pmullw xmm3, [GOTOFF(ebx,PW_THREE)] + paddw xmm1, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm5, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm0, [GOTOFF(ebx,PW_SEVEN)] + paddw xmm2, [GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1, xmm7 + paddw xmm5, xmm3 + psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + 
paddw xmm0, xmm7 + paddw xmm2, xmm3 + psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0, BYTE_BIT + psllw xmm2, BYTE_BIT + por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0, xmm6 + movdqa xmm2, xmm4 + psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1, xmm6 + movdqa xmm5, xmm4 + pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3, xmm4 + psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6, [GOTOFF(ebx,PW_THREE)] + pmullw xmm4, [GOTOFF(ebx,PW_THREE)] + paddw xmm1, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm0, [GOTOFF(ebx,PW_EIGHT)] + paddw xmm7, [GOTOFF(ebx,PW_SEVEN)] + paddw xmm5, [GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1, xmm6 + paddw xmm0, xmm4 + psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7, xmm6 + paddw xmm5, xmm4 + psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7, BYTE_BIT + psllw xmm5, BYTE_BIT + por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 
13 14 15) + por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 + + poppic ebx + + sub eax, byte SIZEOF_XMMWORD + add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 + add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_XMMWORD ; outptr0 + add edi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax, eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -531,92 +531,92 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): ; JSAMPARRAY *output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define output_width(b) (b)+12 ; JDIMENSION output_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr - align 16 - global EXTN(jsimd_h2v1_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_upsample_sse2) EXTN(jsimd_h2v1_upsample_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - 
- mov edx, JDIMENSION [output_width(ebp)] - add edx, byte (2*SIZEOF_XMMWORD)-1 - and edx, byte -(2*SIZEOF_XMMWORD) - jz short .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz short .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz short .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 .rowloop: - push edi - push esi + push edi + push esi - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr - mov eax,edx ; colctr - alignx 16,7 + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax, edx ; colctr + alignx 16, 7 .columnloop: - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm1,xmm0 - punpcklbw xmm0,xmm0 - punpckhbw xmm1,xmm1 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqa xmm3,xmm2 - punpcklbw xmm2,xmm2 - punpckhbw xmm3,xmm3 + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 - movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD 
[edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - add esi, byte 2*SIZEOF_XMMWORD ; inptr - add edi, byte 4*SIZEOF_XMMWORD ; outptr - jmp short .columnloop - alignx 16,7 + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + alignx 16, 7 .nextrow: - pop esi - pop edi + pop esi + pop edi - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec ecx ; rowctr - jg short .rowloop + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -630,99 +630,99 @@ EXTN(jsimd_h2v1_upsample_sse2): ; JSAMPARRAY *output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define output_width(b) (b)+12 ; JDIMENSION output_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr - align 16 - global EXTN(jsimd_h2v2_upsample_sse2) + align 16 + global EXTN(jsimd_h2v2_upsample_sse2) EXTN(jsimd_h2v2_upsample_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov edx, JDIMENSION [output_width(ebp)] - add edx, byte (2*SIZEOF_XMMWORD)-1 - and edx, byte 
-(2*SIZEOF_XMMWORD) - jz near .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx, ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16, 7 .rowloop: - push edi - push esi - - mov esi, JSAMPROW [esi] ; inptr - mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 - mov eax,edx ; colctr - alignx 16,7 + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax, edx ; colctr + alignx 16, 7 .columnloop: - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm1,xmm0 - punpcklbw xmm0,xmm0 - punpckhbw xmm1,xmm1 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 - movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqa 
xmm3,xmm2 - punpcklbw xmm2,xmm2 - punpckhbw xmm3,xmm3 + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm2 + punpckhbw xmm3, xmm3 - movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 - movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - add esi, byte 2*SIZEOF_XMMWORD ; inptr - add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 - add edi, byte 4*SIZEOF_XMMWORD ; outptr1 - jmp short .columnloop - alignx 16,7 + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 + add edi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + alignx 16, 7 .nextrow: - pop esi - pop edi + pop esi + pop edi - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; output_data - sub ecx, byte 2 ; rowctr - jg short .rowloop + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jfdctflt-sse-64.asm b/simd/jfdctflt-sse-64.asm index 4b64ea4..d52568d 100644 --- a/simd/jfdctflt-sse-64.asm +++ b/simd/jfdctflt-sse-64.asm @@ -25,32 +25,32 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE %endmacro ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fdct_float_sse) + alignz 16 + global EXTN(jconst_fdct_float_sse) EXTN(jconst_fdct_float_sse): -PD_0_382 times 4 dd 0.382683432365089771728460 -PD_0_707 times 4 dd 0.707106781186547524400844 -PD_0_541 times 4 dd 0.541196100146196984399723 -PD_1_306 times 4 dd 1.306562964876376527856643 +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform the forward DCT on one block of samples. 
; @@ -60,298 +60,298 @@ PD_1_306 times 4 dd 1.306562964876376527856643 ; r10 = FAST_FLOAT *data -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_float_sse) + align 16 + global EXTN(jsimd_fdct_float_sse) EXTN(jsimd_fdct_float_sse): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - - ; ---- Pass 1: process rows. - - mov rdx, r10 ; (FAST_FLOAT *) - mov rcx, DCTSIZE/4 + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (FAST_FLOAT *) + mov rcx, DCTSIZE/4 .rowloop: - movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)] - - ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) - ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) - - movaps xmm4,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) - unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) - movaps xmm5,xmm2 ; transpose coefficients(phase 1) - unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) - unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) - - movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] - - ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) - ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) - - movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) - movaps XMMWORD 
[wk(1)], xmm2 ; wk(1)=(24 34 25 35) - - movaps xmm4,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) - movaps xmm2,xmm1 ; transpose coefficients(phase 1) - unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) - unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) - - movaps xmm7,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 - unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 - movaps xmm3,xmm2 ; transpose coefficients(phase 2) - unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 - unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 - - movaps xmm0,xmm7 - movaps xmm5,xmm6 - subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 - subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 - addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 - addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 - - movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) - movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) - movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 - movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movaps xmm7,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 - unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 - movaps xmm6,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 - unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 - - movaps xmm2,xmm7 - movaps xmm3,xmm4 - addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 - addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 - subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 - subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movaps xmm1,xmm5 - movaps xmm6,xmm0 - subps xmm5,xmm7 ; xmm5=tmp13 - subps xmm0,xmm4 ; xmm0=tmp12 - addps xmm1,xmm7 ; xmm1=tmp10 - addps xmm6,xmm4 ; xmm6=tmp11 - - addps xmm0,xmm5 - mulps xmm0,[rel PD_0_707] ; xmm0=z1 - - movaps xmm7,xmm1 - movaps xmm4,xmm5 - subps xmm1,xmm6 ; xmm1=data4 - subps xmm5,xmm0 ; xmm5=data6 - addps xmm7,xmm6 ; xmm7=data0 - addps xmm4,xmm0 ; xmm4=data2 - - movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1 - 
movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 - - ; -- Odd part - - movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 - movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 - - addps xmm2,xmm3 ; xmm2=tmp10 - addps xmm3,xmm6 ; xmm3=tmp11 - addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 - - mulps xmm3,[rel PD_0_707] ; xmm3=z3 - - movaps xmm1,xmm2 ; xmm1=tmp10 - subps xmm2,xmm6 - mulps xmm2,[rel PD_0_382] ; xmm2=z5 - mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) - mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) - addps xmm1,xmm2 ; xmm1=z2 - addps xmm6,xmm2 ; xmm6=z4 - - movaps xmm5,xmm0 - subps xmm0,xmm3 ; xmm0=z13 - addps xmm5,xmm3 ; xmm5=z11 - - movaps xmm7,xmm0 - movaps xmm4,xmm5 - subps xmm0,xmm1 ; xmm0=data3 - subps xmm5,xmm6 ; xmm5=data7 - addps xmm7,xmm1 ; xmm7=data5 - addps xmm4,xmm6 ; xmm4=data1 - - movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 - - add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT - dec rcx - jnz near .rowloop - - ; ---- Pass 2: process columns. 
- - mov rdx, r10 ; (FAST_FLOAT *) - mov rcx, DCTSIZE/4 + movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) + ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31) + unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35) + unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) + ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15) + unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0 + unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) + movaps xmm3, XMMWORD 
[wk(1)] ; xmm3=(24 34 25 35) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [rel PD_0_707] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [rel PD_0_707] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [rel PD_0_382] ; xmm2=z5 + mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 
+ subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov rdx, r10 ; (FAST_FLOAT *) + mov rcx, DCTSIZE/4 .columnloop: - movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)] - - ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) - ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) - - movaps xmm4,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) - unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) - movaps xmm5,xmm2 ; transpose coefficients(phase 1) - unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) - unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) - - movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)] - - ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) - ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) - - movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) - movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) - - movaps xmm4,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) - unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) - movaps xmm2,xmm1 ; transpose coefficients(phase 1) - unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) - unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) - - movaps xmm7,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 - unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 - 
movaps xmm3,xmm2 ; transpose coefficients(phase 2) - unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 - unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 - - movaps xmm0,xmm7 - movaps xmm5,xmm6 - subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 - subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 - addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 - addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 - - movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) - movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) - movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 - movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movaps xmm7,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 - unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 - movaps xmm6,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 - unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 - - movaps xmm2,xmm7 - movaps xmm3,xmm4 - addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 - addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 - subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 - subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movaps xmm1,xmm5 - movaps xmm6,xmm0 - subps xmm5,xmm7 ; xmm5=tmp13 - subps xmm0,xmm4 ; xmm0=tmp12 - addps xmm1,xmm7 ; xmm1=tmp10 - addps xmm6,xmm4 ; xmm6=tmp11 - - addps xmm0,xmm5 - mulps xmm0,[rel PD_0_707] ; xmm0=z1 - - movaps xmm7,xmm1 - movaps xmm4,xmm5 - subps xmm1,xmm6 ; xmm1=data4 - subps xmm5,xmm0 ; xmm5=data6 - addps xmm7,xmm6 ; xmm7=data0 - addps xmm4,xmm0 ; xmm4=data2 - - movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 - - ; -- Odd part - - movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 - movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 - - addps xmm2,xmm3 ; xmm2=tmp10 - addps xmm3,xmm6 ; xmm3=tmp11 - addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 - - mulps xmm3,[rel PD_0_707] ; xmm3=z3 - - movaps xmm1,xmm2 ; xmm1=tmp10 
- subps xmm2,xmm6 - mulps xmm2,[rel PD_0_382] ; xmm2=z5 - mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) - mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) - addps xmm1,xmm2 ; xmm1=z2 - addps xmm6,xmm2 ; xmm6=z4 - - movaps xmm5,xmm0 - subps xmm0,xmm3 ; xmm0=z13 - addps xmm5,xmm3 ; xmm5=z11 - - movaps xmm7,xmm0 - movaps xmm4,xmm5 - subps xmm0,xmm1 ; xmm0=data3 - subps xmm5,xmm6 ; xmm5=data7 - addps xmm7,xmm1 ; xmm7=data5 - addps xmm4,xmm6 ; xmm4=data1 - - movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 - - add rdx, byte 4*SIZEOF_FAST_FLOAT - dec rcx - jnz near .columnloop - - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) + ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13) + unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53) + unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) + ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) + + movaps xmm4, xmm6 ; transpose 
coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11) + unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51) + unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0 + unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [rel PD_0_707] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD 
[XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [rel PD_0_707] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [rel PD_0_382] ; xmm2=z5 + mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + add rdx, byte 4*SIZEOF_FAST_FLOAT + dec rcx + jnz near .columnloop + + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jfdctflt-sse.asm b/simd/jfdctflt-sse.asm index e7ede26..8b1ce18 100644 --- a/simd/jfdctflt-sse.asm +++ b/simd/jfdctflt-sse.asm @@ -24,32 +24,32 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE %endmacro ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fdct_float_sse) + alignz 16 + global EXTN(jconst_fdct_float_sse) EXTN(jconst_fdct_float_sse): -PD_0_382 times 4 dd 0.382683432365089771728460 -PD_0_707 times 4 dd 0.707106781186547524400844 -PD_0_541 times 4 dd 0.541196100146196984399723 -PD_1_306 times 4 dd 1.306562964876376527856643 +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform the forward DCT on one block of samples. 
; @@ -57,313 +57,313 @@ PD_1_306 times 4 dd 1.306562964876376527856643 ; jsimd_fdct_float_sse (FAST_FLOAT *data) ; -%define data(b) (b)+8 ; FAST_FLOAT *data +%define data(b) (b)+8 ; FAST_FLOAT *data -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_float_sse) + align 16 + global EXTN(jsimd_fdct_float_sse) EXTN(jsimd_fdct_float_sse): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. - - mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) - mov ecx, DCTSIZE/4 - alignx 16,7 + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/4 + alignx 16, 7 .rowloop: - movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)] - - ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) - ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) - - movaps xmm4,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) - unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) - movaps xmm5,xmm2 ; transpose coefficients(phase 1) - unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) - unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) - - movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - - ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) - ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) - - movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) - movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) - - movaps xmm4,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) - movaps xmm2,xmm1 ; transpose coefficients(phase 1) - unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) - unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) - - movaps xmm7,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 - unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 - movaps xmm3,xmm2 ; transpose coefficients(phase 2) - unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 - unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 - - movaps xmm0,xmm7 - movaps xmm5,xmm6 - subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 - subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 - addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 - addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 - - movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) - movaps 
xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) - movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 - movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movaps xmm7,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 - unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 - movaps xmm6,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 - unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 - - movaps xmm2,xmm7 - movaps xmm3,xmm4 - addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 - addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 - subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 - subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movaps xmm1,xmm5 - movaps xmm6,xmm0 - subps xmm5,xmm7 ; xmm5=tmp13 - subps xmm0,xmm4 ; xmm0=tmp12 - addps xmm1,xmm7 ; xmm1=tmp10 - addps xmm6,xmm4 ; xmm6=tmp11 - - addps xmm0,xmm5 - mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 - - movaps xmm7,xmm1 - movaps xmm4,xmm5 - subps xmm1,xmm6 ; xmm1=data4 - subps xmm5,xmm0 ; xmm5=data6 - addps xmm7,xmm6 ; xmm7=data0 - addps xmm4,xmm0 ; xmm4=data2 - - movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - ; -- Odd part - - movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 - movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 - - addps xmm2,xmm3 ; xmm2=tmp10 - addps xmm3,xmm6 ; xmm3=tmp11 - addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 - - mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 - - movaps xmm1,xmm2 ; xmm1=tmp10 - subps xmm2,xmm6 - mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 - mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) - mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) - addps xmm1,xmm2 ; xmm1=z2 - addps xmm6,xmm2 ; xmm6=z4 - - movaps xmm5,xmm0 - subps xmm0,xmm3 ; xmm0=z13 - addps xmm5,xmm3 ; xmm5=z11 - - movaps xmm7,xmm0 - movaps xmm4,xmm5 - subps xmm0,xmm1 ; 
xmm0=data3 - subps xmm5,xmm6 ; xmm5=data7 - addps xmm7,xmm1 ; xmm7=data5 - addps xmm4,xmm6 ; xmm4=data1 - - movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT - dec ecx - jnz near .rowloop - - ; ---- Pass 2: process columns. - - mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) - mov ecx, DCTSIZE/4 - alignx 16,7 + movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) + ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) + + movaps xmm4, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31) + unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35) + unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) + ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15) + unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0 + unpckhps2 
xmm7, xmm0 ; xmm7=(01 11 21 31)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5 + + movaps xmm2, xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; 
xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5 + mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/4 + alignx 16, 7 .columnloop: - movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] - - ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) - ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) - - movaps xmm4,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) - unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) - movaps xmm5,xmm2 ; transpose coefficients(phase 1) - unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) - unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) - - movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] - - ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) - ; xmm7=(01 11 21 
31), xmm3=(41 51 61 71) - - movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) - movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) - - movaps xmm4,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) - unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) - movaps xmm2,xmm1 ; transpose coefficients(phase 1) - unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) - unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) - - movaps xmm7,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 - unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 - movaps xmm3,xmm2 ; transpose coefficients(phase 2) - unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 - unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 - - movaps xmm0,xmm7 - movaps xmm5,xmm6 - subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 - subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 - addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 - addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 - - movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) - movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) - movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 - movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movaps xmm7,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 - unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 - movaps xmm6,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 - unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 - - movaps xmm2,xmm7 - movaps xmm3,xmm4 - addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 - addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 - subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 - subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movaps xmm1,xmm5 - movaps xmm6,xmm0 - subps xmm5,xmm7 ; xmm5=tmp13 - subps xmm0,xmm4 ; xmm0=tmp12 - addps xmm1,xmm7 ; xmm1=tmp10 - addps xmm6,xmm4 ; xmm6=tmp11 - - addps xmm0,xmm5 - mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 - - movaps xmm7,xmm1 - movaps xmm4,xmm5 - subps xmm1,xmm6 ; xmm1=data4 - subps xmm5,xmm0 ; xmm5=data6 - addps xmm7,xmm6 ; 
xmm7=data0 - addps xmm4,xmm0 ; xmm4=data2 - - movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - ; -- Odd part - - movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 - movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 - - addps xmm2,xmm3 ; xmm2=tmp10 - addps xmm3,xmm6 ; xmm3=tmp11 - addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 - - mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 - - movaps xmm1,xmm2 ; xmm1=tmp10 - subps xmm2,xmm6 - mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 - mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) - mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) - addps xmm1,xmm2 ; xmm1=z2 - addps xmm6,xmm2 ; xmm6=z4 - - movaps xmm5,xmm0 - subps xmm0,xmm3 ; xmm0=z13 - addps xmm5,xmm3 ; xmm5=z11 - - movaps xmm7,xmm0 - movaps xmm4,xmm5 - subps xmm0,xmm1 ; xmm0=data3 - subps xmm5,xmm6 ; xmm5=data7 - addps xmm7,xmm1 ; xmm7=data5 - addps xmm4,xmm6 ; xmm4=data1 - - movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - add edx, byte 4*SIZEOF_FAST_FLOAT - dec ecx - jnz near .columnloop - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) + ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) + + movaps xmm4, xmm0 ; transpose coefficients(phase 
1) + unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13) + unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33) + movaps xmm5, xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53) + unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) + ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) + + movaps xmm4, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11) + unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31) + movaps xmm2, xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51) + unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71) + + movaps xmm7, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0 + unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1 + movaps xmm3, xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6 + unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7 + + movaps xmm0, xmm7 + movaps xmm5, xmm6 + subps xmm7, xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6, xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0, xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5, xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2 + unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3 + movaps xmm6, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4 + unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5 + + movaps xmm2, 
xmm7 + movaps xmm3, xmm4 + addps xmm7, xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4, xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2, xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1, xmm5 + movaps xmm6, xmm0 + subps xmm5, xmm7 ; xmm5=tmp13 + subps xmm0, xmm4 ; xmm0=tmp12 + addps xmm1, xmm7 ; xmm1=tmp10 + addps xmm6, xmm4 ; xmm6=tmp11 + + addps xmm0, xmm5 + mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1 + + movaps xmm7, xmm1 + movaps xmm4, xmm5 + subps xmm1, xmm6 ; xmm1=data4 + subps xmm5, xmm0 ; xmm5=data6 + addps xmm7, xmm6 ; xmm7=data0 + addps xmm4, xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2, xmm3 ; xmm2=tmp10 + addps xmm3, xmm6 ; xmm3=tmp11 + addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3 + + movaps xmm1, xmm2 ; xmm1=tmp10 + subps xmm2, xmm6 + mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5 + mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1, xmm2 ; xmm1=z2 + addps xmm6, xmm2 ; xmm6=z4 + + movaps xmm5, xmm0 + subps xmm0, xmm3 ; xmm0=z13 + addps xmm5, xmm3 ; xmm5=z11 + + movaps xmm7, xmm0 + movaps xmm4, xmm5 + subps xmm0, xmm1 ; xmm0=data3 + subps xmm5, xmm6 ; xmm5=data7 + addps xmm7, xmm1 ; xmm7=data5 + addps xmm4, xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + add edx, byte 4*SIZEOF_FAST_FLOAT + dec ecx + jnz near 
.columnloop + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jfdctfst-sse2-64.asm b/simd/jfdctfst-sse2-64.asm index 4c96685..19aa304 100644 --- a/simd/jfdctfst-sse2-64.asm +++ b/simd/jfdctfst-sse2-64.asm @@ -26,46 +26,46 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 8 ; 14 is also OK. +%define CONST_BITS 8 ; 14 is also OK. %if CONST_BITS == 8 -F_0_382 equ 98 ; FIX(0.382683433) -F_0_541 equ 139 ; FIX(0.541196100) -F_0_707 equ 181 ; FIX(0.707106781) -F_1_306 equ 334 ; FIX(1.306562965) +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) -F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) +F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) -%define PRE_MULTIPLY_SCALE_BITS 2 -%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 - global EXTN(jconst_fdct_ifast_sse2) + alignz 16 + global EXTN(jconst_fdct_ifast_sse2) EXTN(jconst_fdct_ifast_sse2): -PW_F0707 times 8 dw F_0_707 << CONST_SHIFT -PW_F0382 times 8 dw F_0_382 << CONST_SHIFT -PW_F0541 times 8 dw F_0_541 << CONST_SHIFT -PW_F1306 times 8 dw F_1_306 << CONST_SHIFT +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform the forward DCT on one block of samples. 
; @@ -75,317 +75,317 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT ; r10 = DCTELEM *data -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_ifast_sse2) + align 16 + global EXTN(jsimd_fdct_ifast_sse2) EXTN(jsimd_fdct_ifast_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - - ; ---- Pass 1: process rows. - - mov rdx, r10 ; (DCTELEM *) - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] - - ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) - ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) - punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) - movdqa xmm5,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) - - movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] - - ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) - ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) - - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm7 ; xmm2=(44 54 
45 55 46 56 47 57) - movdqa xmm5,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) - punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) - punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) - movdqa xmm3,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) - movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) - - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) - punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) - movdqa xmm2,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) - punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa xmm1,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 - punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 - movdqa xmm5,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 - punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 - - movdqa xmm6,xmm1 - movdqa xmm3,xmm0 - psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 - psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 - paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 - paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 - - movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) - movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 - 
punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 - movdqa xmm0,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 - punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 - - movdqa xmm2,xmm1 - movdqa xmm5,xmm7 - paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 - paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 - psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 - psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm0,xmm6 - psubw xmm3,xmm1 ; xmm3=tmp13 - psubw xmm6,xmm7 ; xmm6=tmp12 - paddw xmm4,xmm1 ; xmm4=tmp10 - paddw xmm0,xmm7 ; xmm0=tmp11 - - paddw xmm6,xmm3 - psllw xmm6,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm6,[rel PW_F0707] ; xmm6=z1 - - movdqa xmm1,xmm4 - movdqa xmm7,xmm3 - psubw xmm4,xmm0 ; xmm4=data4 - psubw xmm3,xmm6 ; xmm3=data6 - paddw xmm1,xmm0 ; xmm1=data0 - paddw xmm7,xmm6 ; xmm7=data2 - - movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 - - ; -- Odd part - - paddw xmm2,xmm5 ; xmm2=tmp10 - paddw xmm5,xmm0 ; xmm5=tmp11 - paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 - - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[rel PW_F0707] ; xmm5=z3 - - movdqa xmm4,xmm2 ; xmm4=tmp10 - psubw xmm2,xmm0 - pmulhw xmm2,[rel PW_F0382] ; xmm2=z5 - pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) - pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) - paddw xmm4,xmm2 ; xmm4=z2 - paddw xmm0,xmm2 ; xmm0=z4 - - movdqa xmm3,xmm6 - psubw xmm6,xmm5 ; xmm6=z13 - paddw xmm3,xmm5 ; xmm3=z11 - - movdqa xmm2,xmm6 - movdqa xmm5,xmm3 - psubw xmm6,xmm4 ; xmm6=data3 - psubw xmm3,xmm0 ; xmm3=data7 - paddw xmm2,xmm4 ; xmm2=data5 - paddw xmm5,xmm0 ; xmm5=data1 - - ; ---- Pass 2: process columns. 
- - ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) - ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) - - movdqa xmm4,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) - punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) - movdqa xmm0,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) - punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 - - ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) - ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) - - movdqa xmm7,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) - punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) - movdqa xmm0,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) - punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) - - movdqa xmm2,xmm5 ; transpose coefficients(phase 2) - punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) - punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) - movdqa xmm3,xmm7 ; transpose coefficients(phase 2) - punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) - punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) - - movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) - - movdqa xmm2,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) - punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) - movdqa xmm7,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 
51 52 53) - punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) - - movdqa xmm6,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 - punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 - movdqa xmm0,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 - punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 - - movdqa xmm5,xmm6 - movdqa xmm3,xmm1 - psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 - psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 - paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 - paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) - movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 - - movdqa xmm6,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 - punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 - movdqa xmm1,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 - punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 - - movdqa xmm7,xmm6 - movdqa xmm0,xmm2 - paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 - paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 - psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 - psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm1,xmm5 - psubw xmm3,xmm6 ; xmm3=tmp13 - psubw xmm5,xmm2 ; xmm5=tmp12 - paddw xmm4,xmm6 ; xmm4=tmp10 - paddw xmm1,xmm2 ; xmm1=tmp11 - - paddw xmm5,xmm3 - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[rel PW_F0707] ; xmm5=z1 - - movdqa xmm6,xmm4 - movdqa xmm2,xmm3 - psubw xmm4,xmm1 ; xmm4=data4 - psubw xmm3,xmm5 ; xmm3=data6 - paddw xmm6,xmm1 ; xmm6=data0 - paddw xmm2,xmm5 ; xmm2=data2 - - movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3 - movdqa XMMWORD 
[XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6 - movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2 - - ; -- Odd part - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 - - paddw xmm7,xmm0 ; xmm7=tmp10 - paddw xmm0,xmm1 ; xmm0=tmp11 - paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 - - psllw xmm7,PRE_MULTIPLY_SCALE_BITS - psllw xmm1,PRE_MULTIPLY_SCALE_BITS - - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm0,[rel PW_F0707] ; xmm0=z3 - - movdqa xmm4,xmm7 ; xmm4=tmp10 - psubw xmm7,xmm1 - pmulhw xmm7,[rel PW_F0382] ; xmm7=z5 - pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) - pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) - paddw xmm4,xmm7 ; xmm4=z2 - paddw xmm1,xmm7 ; xmm1=z4 - - movdqa xmm3,xmm5 - psubw xmm5,xmm0 ; xmm5=z13 - paddw xmm3,xmm0 ; xmm3=z11 - - movdqa xmm6,xmm5 - movdqa xmm2,xmm3 - psubw xmm5,xmm4 ; xmm5=data3 - psubw xmm3,xmm1 ; xmm3=data7 - paddw xmm6,xmm4 ; xmm6=data5 - paddw xmm2,xmm1 ; xmm2=data1 - - movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5 - movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3 - movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 - movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 - - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. 
+ + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD 
[wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + psubw xmm3, xmm1 ; xmm3=tmp13 + psubw xmm6, xmm7 ; xmm6=tmp12 
+ paddw xmm4, xmm1 ; xmm4=tmp10 + paddw xmm0, xmm7 ; xmm0=tmp11 + + paddw xmm6, xmm3 + psllw xmm6, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6, [rel PW_F0707] ; xmm6=z1 + + movdqa xmm1, xmm4 + movdqa xmm7, xmm3 + psubw xmm4, xmm0 ; xmm4=data4 + psubw xmm3, xmm6 ; xmm3=data6 + paddw xmm1, xmm0 ; xmm1=data0 + paddw xmm7, xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm5, xmm0 ; xmm5=tmp11 + paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [rel PW_F0707] ; xmm5=z3 + + movdqa xmm4, xmm2 ; xmm4=tmp10 + psubw xmm2, xmm0 + pmulhw xmm2, [rel PW_F0382] ; xmm2=z5 + pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm2 ; xmm4=z2 + paddw xmm0, xmm2 ; xmm0=z4 + + movdqa xmm3, xmm6 + psubw xmm6, xmm5 ; xmm6=z13 + paddw xmm3, xmm5 ; xmm3=z11 + + movdqa xmm2, xmm6 + movdqa xmm5, xmm3 + psubw xmm6, xmm4 ; xmm6=data3 + psubw xmm3, xmm0 ; xmm3=data7 + paddw xmm2, xmm4 ; xmm2=data5 + paddw xmm5, xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. 
+ + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2, xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 
; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5, xmm6 + movdqa xmm3, xmm1 + psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7, xmm6 + movdqa xmm0, xmm2 + paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm1, xmm5 + psubw xmm3, xmm6 ; xmm3=tmp13 + psubw xmm5, xmm2 ; xmm5=tmp12 + paddw xmm4, xmm6 ; xmm4=tmp10 + paddw xmm1, xmm2 ; xmm1=tmp11 + + paddw xmm5, xmm3 + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [rel PW_F0707] ; xmm5=z1 + + movdqa xmm6, xmm4 + movdqa xmm2, xmm3 + psubw xmm4, xmm1 ; xmm4=data4 + psubw xmm3, xmm5 ; xmm3=data6 + paddw xmm6, xmm1 ; xmm6=data0 + paddw xmm2, xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD 
[XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7, xmm0 ; xmm7=tmp10 + paddw xmm0, xmm1 ; xmm0=tmp11 + paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7, PRE_MULTIPLY_SCALE_BITS + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0, [rel PW_F0707] ; xmm0=z3 + + movdqa xmm4, xmm7 ; xmm4=tmp10 + psubw xmm7, xmm1 + pmulhw xmm7, [rel PW_F0382] ; xmm7=z5 + pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm7 ; xmm4=z2 + paddw xmm1, xmm7 ; xmm1=z4 + + movdqa xmm3, xmm5 + psubw xmm5, xmm0 ; xmm5=z13 + paddw xmm3, xmm0 ; xmm3=z11 + + movdqa xmm6, xmm5 + movdqa xmm2, xmm3 + psubw xmm5, xmm4 ; xmm5=data3 + psubw xmm3, xmm1 ; xmm3=data7 + paddw xmm6, xmm4 ; xmm6=data5 + paddw xmm2, xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 + + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jfdctfst-sse2.asm b/simd/jfdctfst-sse2.asm index 54856a2..bd7723c 100644 --- a/simd/jfdctfst-sse2.asm +++ b/simd/jfdctfst-sse2.asm @@ -25,46 +25,46 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 8 ; 14 is also OK. +%define CONST_BITS 8 ; 14 is also OK. 
%if CONST_BITS == 8 -F_0_382 equ 98 ; FIX(0.382683433) -F_0_541 equ 139 ; FIX(0.541196100) -F_0_707 equ 181 ; FIX(0.707106781) -F_1_306 equ 334 ; FIX(1.306562965) +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) %else ; NASM cannot do compile-time arithmetic on floating-point constants. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) -F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) +F_0_382 equ DESCALE( 410903207, 30-CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124, 30-CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301, 30-CONST_BITS) ; FIX(1.306562965) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) -%define PRE_MULTIPLY_SCALE_BITS 2 -%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 - global EXTN(jconst_fdct_ifast_sse2) + alignz 16 + global EXTN(jconst_fdct_ifast_sse2) EXTN(jconst_fdct_ifast_sse2): -PW_F0707 times 8 dw F_0_707 << CONST_SHIFT -PW_F0382 times 8 dw F_0_382 << CONST_SHIFT -PW_F0541 times 8 dw F_0_541 << CONST_SHIFT -PW_F1306 times 8 dw F_1_306 << CONST_SHIFT +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - 
SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform the forward DCT on one block of samples. ; @@ -72,332 +72,332 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT ; jsimd_fdct_ifast_sse2 (DCTELEM *data) ; -%define data(b) (b)+8 ; DCTELEM *data +%define data(b) (b)+8 ; DCTELEM *data -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_ifast_sse2) + align 16 + global EXTN(jsimd_fdct_ifast_sse2) EXTN(jsimd_fdct_ifast_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. 
- - mov edx, POINTER [data(eax)] ; (DCTELEM *) - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - - ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) - ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) - punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) - movdqa xmm5,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) - - movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] - - ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) - ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) - - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) - movdqa xmm5,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) - punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) - punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) - movdqa xmm3,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) - movdqa xmm5, XMMWORD [wk(1)] 
; xmm5=(24 34 25 35 26 36 27 37) - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) - movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) - - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) - punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) - movdqa xmm2,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) - punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa xmm1,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 - punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 - movdqa xmm5,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 - punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 - - movdqa xmm6,xmm1 - movdqa xmm3,xmm0 - psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 - psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 - paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 - paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 - - movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) - movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 - punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 - movdqa xmm0,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 - punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 - - movdqa xmm2,xmm1 - movdqa xmm5,xmm7 - paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 - paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 - psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 - psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm0,xmm6 - psubw xmm3,xmm1 ; xmm3=tmp13 - psubw xmm6,xmm7 ; xmm6=tmp12 - paddw xmm4,xmm1 ; xmm4=tmp10 - paddw 
xmm0,xmm7 ; xmm0=tmp11 - - paddw xmm6,xmm3 - psllw xmm6,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1 - - movdqa xmm1,xmm4 - movdqa xmm7,xmm3 - psubw xmm4,xmm0 ; xmm4=data4 - psubw xmm3,xmm6 ; xmm3=data6 - paddw xmm1,xmm0 ; xmm1=data0 - paddw xmm7,xmm6 ; xmm7=data2 - - movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 - - ; -- Odd part - - paddw xmm2,xmm5 ; xmm2=tmp10 - paddw xmm5,xmm0 ; xmm5=tmp11 - paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 - - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3 - - movdqa xmm4,xmm2 ; xmm4=tmp10 - psubw xmm2,xmm0 - pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5 - pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) - pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) - paddw xmm4,xmm2 ; xmm4=z2 - paddw xmm0,xmm2 ; xmm0=z4 - - movdqa xmm3,xmm6 - psubw xmm6,xmm5 ; xmm6=z13 - paddw xmm3,xmm5 ; xmm3=z11 - - movdqa xmm2,xmm6 - movdqa xmm5,xmm3 - psubw xmm6,xmm4 ; xmm6=data3 - psubw xmm3,xmm0 ; xmm3=data7 - paddw xmm2,xmm4 ; xmm2=data5 - paddw xmm5,xmm0 ; xmm5=data1 - - ; ---- Pass 2: process columns. 
- -; mov edx, POINTER [data(eax)] ; (DCTELEM *) - - ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) - ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) - - movdqa xmm4,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) - punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) - movdqa xmm0,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) - punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 - - ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) - ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) - - movdqa xmm7,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) - punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) - movdqa xmm0,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) - punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) - - movdqa xmm2,xmm5 ; transpose coefficients(phase 2) - punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) - punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) - movdqa xmm3,xmm7 ; transpose coefficients(phase 2) - punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) - punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) - - movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) - - movdqa xmm2,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) - punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) - movdqa xmm7,xmm4 ; transpose coefficients(phase 
2) - punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) - punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) - - movdqa xmm6,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 - punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 - movdqa xmm0,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 - punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 - - movdqa xmm5,xmm6 - movdqa xmm3,xmm1 - psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 - psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 - paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 - paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) - movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 - - movdqa xmm6,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 - punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 - movdqa xmm1,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 - punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 - - movdqa xmm7,xmm6 - movdqa xmm0,xmm2 - paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 - paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 - psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 - psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm1,xmm5 - psubw xmm3,xmm6 ; xmm3=tmp13 - psubw xmm5,xmm2 ; xmm5=tmp12 - paddw xmm4,xmm6 ; xmm4=tmp10 - paddw xmm1,xmm2 ; xmm1=tmp11 - - paddw xmm5,xmm3 - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1 - - movdqa xmm6,xmm4 - movdqa xmm2,xmm3 - psubw xmm4,xmm1 ; xmm4=data4 - psubw xmm3,xmm5 ; xmm3=data6 - paddw xmm6,xmm1 ; xmm6=data0 - paddw xmm2,xmm5 ; xmm2=data2 - - movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD 
[XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 - movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 - movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 - - ; -- Odd part - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 - - paddw xmm7,xmm0 ; xmm7=tmp10 - paddw xmm0,xmm1 ; xmm0=tmp11 - paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 - - psllw xmm7,PRE_MULTIPLY_SCALE_BITS - psllw xmm1,PRE_MULTIPLY_SCALE_BITS - - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3 - - movdqa xmm4,xmm7 ; xmm4=tmp10 - psubw xmm7,xmm1 - pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5 - pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) - pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) - paddw xmm4,xmm7 ; xmm4=z2 - paddw xmm1,xmm7 ; xmm1=z4 - - movdqa xmm3,xmm5 - psubw xmm5,xmm0 ; xmm5=z13 - paddw xmm3,xmm0 ; xmm3=z11 - - movdqa xmm6,xmm5 - movdqa xmm2,xmm3 - psubw xmm5,xmm4 ; xmm5=data3 - psubw xmm3,xmm1 ; xmm3=data7 - paddw xmm6,xmm4 ; xmm6=data5 - paddw xmm2,xmm1 ; xmm2=data1 - - movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 - movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 - movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 - movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa 
xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + psubw xmm3, xmm1 ; xmm3=tmp13 + psubw xmm6, 
xmm7 ; xmm6=tmp12 + paddw xmm4, xmm1 ; xmm4=tmp10 + paddw xmm0, xmm7 ; xmm0=tmp11 + + paddw xmm6, xmm3 + psllw xmm6, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1 + + movdqa xmm1, xmm4 + movdqa xmm7, xmm3 + psubw xmm4, xmm0 ; xmm4=data4 + psubw xmm3, xmm6 ; xmm3=data6 + paddw xmm1, xmm0 ; xmm1=data0 + paddw xmm7, xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm5, xmm0 ; xmm5=tmp11 + paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3 + + movdqa xmm4, xmm2 ; xmm4=tmp10 + psubw xmm2, xmm0 + pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5 + pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm2 ; xmm4=z2 + paddw xmm0, xmm2 ; xmm0=z4 + + movdqa xmm3, xmm6 + psubw xmm6, xmm5 ; xmm6=z13 + paddw xmm3, xmm5 ; xmm3=z11 + + movdqa xmm2, xmm6 + movdqa xmm5, xmm3 + psubw xmm6, xmm4 ; xmm6=data3 + psubw xmm3, xmm0 ; xmm3=data7 + paddw xmm2, xmm4 ; xmm2=data5 + paddw xmm5, xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. 
+ +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2, xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7, xmm4 ; 
transpose coefficients(phase 2) + punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5, xmm6 + movdqa xmm3, xmm1 + psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7, xmm6 + movdqa xmm0, xmm2 + paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm1, xmm5 + psubw xmm3, xmm6 ; xmm3=tmp13 + psubw xmm5, xmm2 ; xmm5=tmp12 + paddw xmm4, xmm6 ; xmm4=tmp10 + paddw xmm1, xmm2 ; xmm1=tmp11 + + paddw xmm5, xmm3 + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1 + + movdqa xmm6, xmm4 + movdqa xmm2, xmm3 + psubw xmm4, xmm1 ; xmm4=data4 + psubw xmm3, xmm5 ; xmm3=data6 + paddw xmm6, xmm1 ; xmm6=data0 + paddw xmm2, xmm5 ; xmm2=data2 + + movdqa XMMWORD 
[XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7, xmm0 ; xmm7=tmp10 + paddw xmm0, xmm1 ; xmm0=tmp11 + paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7, PRE_MULTIPLY_SCALE_BITS + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3 + + movdqa xmm4, xmm7 ; xmm4=tmp10 + psubw xmm7, xmm1 + pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5 + pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4, xmm7 ; xmm4=z2 + paddw xmm1, xmm7 ; xmm1=z4 + + movdqa xmm3, xmm5 + psubw xmm5, xmm0 ; xmm5=z13 + paddw xmm3, xmm0 ; xmm3=z11 + + movdqa xmm6, xmm5 + movdqa xmm2, xmm3 + psubw xmm5, xmm4 ; xmm5=data3 + psubw xmm3, xmm1 ; xmm3=data7 + paddw xmm6, xmm4 ; xmm6=data5 + paddw xmm2, xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jfdctint-sse2-64.asm b/simd/jfdctint-sse2-64.asm index 9a0ca0f..0f82cdf 100644 --- a/simd/jfdctint-sse2-64.asm +++ b/simd/jfdctint-sse2-64.asm @@ -26,67 +26,67 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS) +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) %if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fdct_islow_sse2) + alignz 16 + global EXTN(jconst_fdct_islow_sse2) EXTN(jconst_fdct_islow_sse2): -PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 4 dw F_0_541, 
(F_0_541-F_1_847) -PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) -PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform the forward DCT on one block of samples. ; @@ -96,526 +96,526 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) ; r10 = DCTELEM *data -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 6 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 6 - align 16 - global EXTN(jsimd_fdct_islow_sse2) + align 16 + global EXTN(jsimd_fdct_islow_sse2) EXTN(jsimd_fdct_islow_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - - ; ---- Pass 1: process rows. 
- - mov rdx, r10 ; (DCTELEM *) - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] - - ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) - ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) - punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) - movdqa xmm5,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) - - movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] - - ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) - ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) - - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) - movdqa xmm5,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) - punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) - punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) - movdqa xmm3,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 
25 35 26 36 27 37) - movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) - movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) - - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) - punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) - movdqa xmm2,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) - punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa xmm1,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 - punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 - movdqa xmm5,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 - punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 - - movdqa xmm6,xmm1 - movdqa xmm3,xmm0 - psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 - psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 - paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 - paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 - - movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) - movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 - punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 - movdqa xmm0,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 - punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 - - movdqa xmm2,xmm1 - movdqa xmm5,xmm7 - paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 - paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 - psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 - psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm0,xmm6 - paddw xmm3,xmm1 ; xmm3=tmp10 - paddw xmm6,xmm7 ; xmm6=tmp11 - psubw xmm4,xmm1 ; xmm4=tmp13 - psubw xmm0,xmm7 ; 
xmm0=tmp12 - - movdqa xmm1,xmm3 - paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 - psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 - - psllw xmm3,PASS1_BITS ; xmm3=data0 - psllw xmm1,PASS1_BITS ; xmm1=data4 - - movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 - movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movdqa xmm7,xmm4 ; xmm4=tmp13 - movdqa xmm6,xmm4 - punpcklwd xmm7,xmm0 ; xmm0=tmp12 - punpckhwd xmm6,xmm0 - movdqa xmm4,xmm7 - movdqa xmm0,xmm6 - pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L - pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H - pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L - pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H - - paddd xmm7,[rel PD_DESCALE_P1] - paddd xmm6,[rel PD_DESCALE_P1] - psrad xmm7,DESCALE_P1 - psrad xmm6,DESCALE_P1 - paddd xmm4,[rel PD_DESCALE_P1] - paddd xmm0,[rel PD_DESCALE_P1] - psrad xmm4,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm7,xmm6 ; xmm7=data2 - packssdw xmm4,xmm0 ; xmm4=data6 - - movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 - movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 - - ; -- Odd part - - movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 - - movdqa xmm6,xmm2 ; xmm2=tmp4 - movdqa xmm0,xmm5 ; xmm5=tmp5 - paddw xmm6,xmm3 ; xmm6=z3 - paddw xmm0,xmm1 ; xmm0=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm7,xmm6 - movdqa xmm4,xmm6 - punpcklwd xmm7,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm6,xmm7 - movdqa xmm0,xmm4 - pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L - pmaddwd 
xmm4,[rel PW_MF078_F117] ; xmm4=z3H - pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L - pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L - movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movdqa xmm7,xmm2 - movdqa xmm4,xmm2 - punpcklwd xmm7,xmm1 - punpckhwd xmm4,xmm1 - movdqa xmm2,xmm7 - movdqa xmm1,xmm4 - pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L - pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H - pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L - pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H - - paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L - paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H - paddd xmm2,xmm6 ; xmm2=data1L - paddd xmm1,xmm0 ; xmm1=data1H - - paddd xmm7,[rel PD_DESCALE_P1] - paddd xmm4,[rel PD_DESCALE_P1] - psrad xmm7,DESCALE_P1 - psrad xmm4,DESCALE_P1 - paddd xmm2,[rel PD_DESCALE_P1] - paddd xmm1,[rel PD_DESCALE_P1] - psrad xmm2,DESCALE_P1 - psrad xmm1,DESCALE_P1 - - packssdw xmm7,xmm4 ; xmm7=data7 - packssdw xmm2,xmm1 ; xmm2=data1 - - movdqa xmm4,xmm5 - movdqa xmm1,xmm5 - punpcklwd xmm4,xmm3 - punpckhwd xmm1,xmm3 - movdqa xmm5,xmm4 - movdqa xmm3,xmm1 - pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L - pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H - pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L - pmaddwd xmm3,[rel 
PW_MF256_F050] ; xmm3=tmp6H - - paddd xmm4,xmm6 ; xmm4=data5L - paddd xmm1,xmm0 ; xmm1=data5H - paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L - paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H - - paddd xmm4,[rel PD_DESCALE_P1] - paddd xmm1,[rel PD_DESCALE_P1] - psrad xmm4,DESCALE_P1 - psrad xmm1,DESCALE_P1 - paddd xmm5,[rel PD_DESCALE_P1] - paddd xmm3,[rel PD_DESCALE_P1] - psrad xmm5,DESCALE_P1 - psrad xmm3,DESCALE_P1 - - packssdw xmm4,xmm1 ; xmm4=data5 - packssdw xmm5,xmm3 ; xmm5=data3 - - ; ---- Pass 2: process columns. - - movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 - movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 - - ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) - ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) - - movdqa xmm1,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) - punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) - movdqa xmm3,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) - punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) - - movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 - movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 - - ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) - ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) - - movdqa xmm0,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) - punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) - movdqa xmm3,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) - punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) - - movdqa xmm4,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) - punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm3 ; 
xmm0=(44 45 46 47 54 55 56 57) - punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) - movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) - movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) - movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) - - movdqa xmm4,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) - punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) - movdqa xmm0,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) - punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) - - movdqa xmm5,xmm6 ; transpose coefficients(phase 3) - punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 - punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 - movdqa xmm3,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 - punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 - - movdqa xmm2,xmm5 - movdqa xmm7,xmm6 - psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 - psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 - paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 - paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 - - movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) - movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movdqa xmm5,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 - punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 - movdqa xmm6,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 - punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 - - movdqa xmm0,xmm5 - movdqa xmm3,xmm4 - paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 - paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 - psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 - psubw xmm3,xmm6 ; 
xmm3=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm1,xmm7 - movdqa xmm6,xmm2 - paddw xmm7,xmm5 ; xmm7=tmp10 - paddw xmm2,xmm4 ; xmm2=tmp11 - psubw xmm1,xmm5 ; xmm1=tmp13 - psubw xmm6,xmm4 ; xmm6=tmp12 - - movdqa xmm5,xmm7 - paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 - psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 - - paddw xmm7,[rel PW_DESCALE_P2X] - paddw xmm5,[rel PW_DESCALE_P2X] - psraw xmm7,PASS1_BITS ; xmm7=data0 - psraw xmm5,PASS1_BITS ; xmm5=data4 - - movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7 - movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movdqa xmm4,xmm1 ; xmm1=tmp13 - movdqa xmm2,xmm1 - punpcklwd xmm4,xmm6 ; xmm6=tmp12 - punpckhwd xmm2,xmm6 - movdqa xmm1,xmm4 - movdqa xmm6,xmm2 - pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L - pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H - pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L - pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H - - paddd xmm4,[rel PD_DESCALE_P2] - paddd xmm2,[rel PD_DESCALE_P2] - psrad xmm4,DESCALE_P2 - psrad xmm2,DESCALE_P2 - paddd xmm1,[rel PD_DESCALE_P2] - paddd xmm6,[rel PD_DESCALE_P2] - psrad xmm1,DESCALE_P2 - psrad xmm6,DESCALE_P2 - - packssdw xmm4,xmm2 ; xmm4=data2 - packssdw xmm1,xmm6 ; xmm1=data6 - - movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1 - - ; -- Odd part - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 - - movdqa xmm2,xmm0 ; xmm0=tmp4 - movdqa xmm6,xmm3 ; xmm3=tmp5 - paddw xmm2,xmm7 ; xmm2=z3 - paddw xmm6,xmm5 ; xmm6=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This 
implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm4,xmm2 - movdqa xmm1,xmm2 - punpcklwd xmm4,xmm6 - punpckhwd xmm1,xmm6 - movdqa xmm2,xmm4 - movdqa xmm6,xmm1 - pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L - pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H - pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L - pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movdqa xmm4,xmm0 - movdqa xmm1,xmm0 - punpcklwd xmm4,xmm5 - punpckhwd xmm1,xmm5 - movdqa xmm0,xmm4 - movdqa xmm5,xmm1 - pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L - pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H - pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L - pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H - - paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L - paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H - paddd xmm0,xmm2 ; xmm0=data1L - paddd xmm5,xmm6 ; xmm5=data1H - - paddd xmm4,[rel PD_DESCALE_P2] - paddd xmm1,[rel PD_DESCALE_P2] - psrad xmm4,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm0,[rel PD_DESCALE_P2] - paddd xmm5,[rel PD_DESCALE_P2] - psrad xmm0,DESCALE_P2 - psrad xmm5,DESCALE_P2 - - packssdw xmm4,xmm1 ; xmm4=data7 - packssdw 
xmm0,xmm5 ; xmm0=data1 - - movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0 - - movdqa xmm1,xmm3 - movdqa xmm5,xmm3 - punpcklwd xmm1,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm3,xmm1 - movdqa xmm7,xmm5 - pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L - pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H - pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L - pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H - - paddd xmm1,xmm2 ; xmm1=data5L - paddd xmm5,xmm6 ; xmm5=data5H - paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L - paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H - - paddd xmm1,[rel PD_DESCALE_P2] - paddd xmm5,[rel PD_DESCALE_P2] - psrad xmm1,DESCALE_P2 - psrad xmm5,DESCALE_P2 - paddd xmm3,[rel PD_DESCALE_P2] - paddd xmm7,[rel PD_DESCALE_P2] - psrad xmm3,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm1,xmm5 ; xmm1=data5 - packssdw xmm3,xmm7 ; xmm3=data3 - - movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 - - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. 
+ + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD 
[wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + paddw xmm3, xmm1 ; xmm3=tmp10 + paddw xmm6, xmm7 ; xmm6=tmp11 
+ psubw xmm4, xmm1 ; xmm4=tmp13 + psubw xmm0, xmm7 ; xmm0=tmp12 + + movdqa xmm1, xmm3 + paddw xmm3, xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1, xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3, PASS1_BITS ; xmm3=data0 + psllw xmm1, PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7, xmm4 ; xmm4=tmp13 + movdqa xmm6, xmm4 + punpcklwd xmm7, xmm0 ; xmm0=tmp12 + punpckhwd xmm6, xmm0 + movdqa xmm4, xmm7 + movdqa xmm0, xmm6 + pmaddwd xmm7, [rel PW_F130_F054] ; xmm7=data2L + pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=data2H + pmaddwd xmm4, [rel PW_F054_MF130] ; xmm4=data6L + pmaddwd xmm0, [rel PW_F054_MF130] ; xmm0=data6H + + paddd xmm7, [rel PD_DESCALE_P1] + paddd xmm6, [rel PD_DESCALE_P1] + psrad xmm7, DESCALE_P1 + psrad xmm6, DESCALE_P1 + paddd xmm4, [rel PD_DESCALE_P1] + paddd xmm0, [rel PD_DESCALE_P1] + psrad xmm4, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm7, xmm6 ; xmm7=data2 + packssdw xmm4, xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6, xmm2 ; xmm2=tmp4 + movdqa xmm0, xmm5 ; xmm5=tmp5 + paddw xmm6, xmm3 ; xmm6=z3 + paddw xmm0, xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7, xmm6 + movdqa xmm4, xmm6 + punpcklwd xmm7, xmm0 + punpckhwd xmm4, xmm0 + movdqa 
xmm6, xmm7 + movdqa xmm0, xmm4 + pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3L + pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3H + pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4L + pmaddwd xmm0, [rel PW_F117_F078] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + punpcklwd xmm7, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm2, xmm7 + movdqa xmm1, xmm4 + pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp4L + pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4H + pmaddwd xmm2, [rel PW_MF089_F060] ; xmm2=tmp7L + pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2, xmm6 ; xmm2=data1L + paddd xmm1, xmm0 ; xmm1=data1H + + paddd xmm7, [rel PD_DESCALE_P1] + paddd xmm4, [rel PD_DESCALE_P1] + psrad xmm7, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm2, [rel PD_DESCALE_P1] + paddd xmm1, [rel PD_DESCALE_P1] + psrad xmm2, DESCALE_P1 + psrad xmm1, DESCALE_P1 + + packssdw xmm7, xmm4 ; xmm7=data7 + packssdw xmm2, xmm1 ; xmm2=data1 + + movdqa xmm4, xmm5 + movdqa xmm1, xmm5 + punpcklwd xmm4, xmm3 + punpckhwd xmm1, xmm3 + movdqa xmm5, xmm4 + movdqa xmm3, xmm1 + pmaddwd xmm4, [rel PW_MF050_MF256] ; xmm4=tmp5L + 
pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5H + pmaddwd xmm5, [rel PW_MF256_F050] ; xmm5=tmp6L + pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6H + + paddd xmm4, xmm6 ; xmm4=data5L + paddd xmm1, xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4, [rel PD_DESCALE_P1] + paddd xmm1, [rel PD_DESCALE_P1] + psrad xmm4, DESCALE_P1 + psrad xmm1, DESCALE_P1 + paddd xmm5, [rel PD_DESCALE_P1] + paddd xmm3, [rel PD_DESCALE_P1] + psrad xmm5, DESCALE_P1 + psrad xmm3, DESCALE_P1 + + packssdw xmm4, xmm1 ; xmm4=data5 + packssdw xmm5, xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 
15 16 17) + punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2, xmm5 + movdqa xmm7, xmm6 + psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0, 
xmm5 + movdqa xmm3, xmm4 + paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1, xmm7 + movdqa xmm6, xmm2 + paddw xmm7, xmm5 ; xmm7=tmp10 + paddw xmm2, xmm4 ; xmm2=tmp11 + psubw xmm1, xmm5 ; xmm1=tmp13 + psubw xmm6, xmm4 ; xmm6=tmp12 + + movdqa xmm5, xmm7 + paddw xmm7, xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5, xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7, [rel PW_DESCALE_P2X] + paddw xmm5, [rel PW_DESCALE_P2X] + psraw xmm7, PASS1_BITS ; xmm7=data0 + psraw xmm5, PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4, xmm1 ; xmm1=tmp13 + movdqa xmm2, xmm1 + punpcklwd xmm4, xmm6 ; xmm6=tmp12 + punpckhwd xmm2, xmm6 + movdqa xmm1, xmm4 + movdqa xmm6, xmm2 + pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=data2L + pmaddwd xmm2, [rel PW_F130_F054] ; xmm2=data2H + pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=data6L + pmaddwd xmm6, [rel PW_F054_MF130] ; xmm6=data6H + + paddd xmm4, [rel PD_DESCALE_P2] + paddd xmm2, [rel PD_DESCALE_P2] + psrad xmm4, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm1, [rel PD_DESCALE_P2] + paddd xmm6, [rel PD_DESCALE_P2] + psrad xmm1, DESCALE_P2 + psrad xmm6, DESCALE_P2 + + packssdw xmm4, xmm2 ; xmm4=data2 + packssdw xmm1, xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2, xmm0 ; xmm0=tmp4 + movdqa xmm6, xmm3 ; 
xmm3=tmp5 + paddw xmm2, xmm7 ; xmm2=z3 + paddw xmm6, xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4, xmm2 + movdqa xmm1, xmm2 + punpcklwd xmm4, xmm6 + punpckhwd xmm1, xmm6 + movdqa xmm2, xmm4 + movdqa xmm6, xmm1 + pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3L + pmaddwd xmm1, [rel PW_MF078_F117] ; xmm1=z3H + pmaddwd xmm2, [rel PW_F117_F078] ; xmm2=z4L + pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm4, xmm5 + punpckhwd xmm1, xmm5 + movdqa xmm0, xmm4 + movdqa xmm5, xmm1 + pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4L + pmaddwd xmm1, [rel PW_MF060_MF089] ; xmm1=tmp4H + pmaddwd xmm0, [rel PW_MF089_F060] ; xmm0=tmp7L + pmaddwd xmm5, [rel PW_MF089_F060] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0, xmm2 ; xmm0=data1L + paddd xmm5, xmm6 ; xmm5=data1H + + paddd xmm4, [rel PD_DESCALE_P2] + paddd 
xmm1, [rel PD_DESCALE_P2] + psrad xmm4, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm0, [rel PD_DESCALE_P2] + paddd xmm5, [rel PD_DESCALE_P2] + psrad xmm0, DESCALE_P2 + psrad xmm5, DESCALE_P2 + + packssdw xmm4, xmm1 ; xmm4=data7 + packssdw xmm0, xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1, xmm3 + movdqa xmm5, xmm3 + punpcklwd xmm1, xmm7 + punpckhwd xmm5, xmm7 + movdqa xmm3, xmm1 + movdqa xmm7, xmm5 + pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5L + pmaddwd xmm5, [rel PW_MF050_MF256] ; xmm5=tmp5H + pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6L + pmaddwd xmm7, [rel PW_MF256_F050] ; xmm7=tmp6H + + paddd xmm1, xmm2 ; xmm1=data5L + paddd xmm5, xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1, [rel PD_DESCALE_P2] + paddd xmm5, [rel PD_DESCALE_P2] + psrad xmm1, DESCALE_P2 + psrad xmm5, DESCALE_P2 + paddd xmm3, [rel PD_DESCALE_P2] + paddd xmm7, [rel PD_DESCALE_P2] + psrad xmm3, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm1, xmm5 ; xmm1=data5 + packssdw xmm3, xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 + + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jfdctint-sse2.asm b/simd/jfdctint-sse2.asm index db9d0bb..ecfff30 100644 --- a/simd/jfdctint-sse2.asm +++ b/simd/jfdctint-sse2.asm @@ -25,67 +25,67 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS) +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) %if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fdct_islow_sse2) + alignz 16 + global EXTN(jconst_fdct_islow_sse2) EXTN(jconst_fdct_islow_sse2): -PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 4 dw F_0_541, 
(F_0_541-F_1_847) -PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) -PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform the forward DCT on one block of samples. 
; @@ -93,541 +93,541 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) ; jsimd_fdct_islow_sse2 (DCTELEM *data) ; -%define data(b) (b)+8 ; DCTELEM *data +%define data(b) (b)+8 ; DCTELEM *data -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 6 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 6 - align 16 - global EXTN(jsimd_fdct_islow_sse2) + align 16 + global EXTN(jsimd_fdct_islow_sse2) EXTN(jsimd_fdct_islow_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. - - mov edx, POINTER [data(eax)] ; (DCTELEM *) - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - - ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) - ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) - punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) - movdqa xmm5,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) - - movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] - - ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) - ; xmm7=( 5 13 21 29 37 45 53 61), 
xmm3=( 7 15 23 31 39 47 55 63) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) - - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) - movdqa xmm5,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) - punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) - punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) - movdqa xmm3,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) - movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) - movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) - - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) - punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) - movdqa xmm2,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) - punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa xmm1,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 - punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 - movdqa xmm5,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 - punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 - - movdqa xmm6,xmm1 - movdqa xmm3,xmm0 - psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 - psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 - paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 - paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 - - movdqa xmm2, XMMWORD 
[wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) - movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 - punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 - movdqa xmm0,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 - punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 - - movdqa xmm2,xmm1 - movdqa xmm5,xmm7 - paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 - paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 - psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 - psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm0,xmm6 - paddw xmm3,xmm1 ; xmm3=tmp10 - paddw xmm6,xmm7 ; xmm6=tmp11 - psubw xmm4,xmm1 ; xmm4=tmp13 - psubw xmm0,xmm7 ; xmm0=tmp12 - - movdqa xmm1,xmm3 - paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 - psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 - - psllw xmm3,PASS1_BITS ; xmm3=data0 - psllw xmm1,PASS1_BITS ; xmm1=data4 - - movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 - movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movdqa xmm7,xmm4 ; xmm4=tmp13 - movdqa xmm6,xmm4 - punpcklwd xmm7,xmm0 ; xmm0=tmp12 - punpckhwd xmm6,xmm0 - movdqa xmm4,xmm7 - movdqa xmm0,xmm6 - pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L - pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H - pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L - pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H - - paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm7,DESCALE_P1 - psrad 
xmm6,DESCALE_P1 - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm4,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm7,xmm6 ; xmm7=data2 - packssdw xmm4,xmm0 ; xmm4=data6 - - movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 - movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 - - ; -- Odd part - - movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 - - movdqa xmm6,xmm2 ; xmm2=tmp4 - movdqa xmm0,xmm5 ; xmm5=tmp5 - paddw xmm6,xmm3 ; xmm6=z3 - paddw xmm0,xmm1 ; xmm0=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm7,xmm6 - movdqa xmm4,xmm6 - punpcklwd xmm7,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm6,xmm7 - movdqa xmm0,xmm4 - pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L - pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H - pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L - pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L - movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movdqa xmm7,xmm2 - movdqa xmm4,xmm2 - 
punpcklwd xmm7,xmm1 - punpckhwd xmm4,xmm1 - movdqa xmm2,xmm7 - movdqa xmm1,xmm4 - pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L - pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H - pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H - - paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L - paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H - paddd xmm2,xmm6 ; xmm2=data1L - paddd xmm1,xmm0 ; xmm1=data1H - - paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm7,DESCALE_P1 - psrad xmm4,DESCALE_P1 - paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm2,DESCALE_P1 - psrad xmm1,DESCALE_P1 - - packssdw xmm7,xmm4 ; xmm7=data7 - packssdw xmm2,xmm1 ; xmm2=data1 - - movdqa xmm4,xmm5 - movdqa xmm1,xmm5 - punpcklwd xmm4,xmm3 - punpckhwd xmm1,xmm3 - movdqa xmm5,xmm4 - movdqa xmm3,xmm1 - pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H - pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L - pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H - - paddd xmm4,xmm6 ; xmm4=data5L - paddd xmm1,xmm0 ; xmm1=data5H - paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L - paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H - - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm4,DESCALE_P1 - psrad xmm1,DESCALE_P1 - paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm5,DESCALE_P1 - psrad xmm3,DESCALE_P1 - - packssdw xmm4,xmm1 ; xmm4=data5 - packssdw xmm5,xmm3 ; xmm5=data3 - - ; ---- Pass 2: process columns. 
- -; mov edx, POINTER [data(eax)] ; (DCTELEM *) - - movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 - movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 - - ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) - ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) - - movdqa xmm1,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) - punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) - movdqa xmm3,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) - punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) - - movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 - movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 - - ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) - ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) - - movdqa xmm0,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) - punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) - movdqa xmm3,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) - punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) - - movdqa xmm4,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) - punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) - punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) - movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) - movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) - movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) - - movdqa xmm4,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) - punpckhdq 
xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) - movdqa xmm0,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) - punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) - - movdqa xmm5,xmm6 ; transpose coefficients(phase 3) - punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 - punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 - movdqa xmm3,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 - punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 - - movdqa xmm2,xmm5 - movdqa xmm7,xmm6 - psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 - psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 - paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 - paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 - - movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) - movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movdqa xmm5,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 - punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 - movdqa xmm6,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 - punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 - - movdqa xmm0,xmm5 - movdqa xmm3,xmm4 - paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 - paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 - psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 - psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm1,xmm7 - movdqa xmm6,xmm2 - paddw xmm7,xmm5 ; xmm7=tmp10 - paddw xmm2,xmm4 ; xmm2=tmp11 - psubw xmm1,xmm5 ; xmm1=tmp13 - psubw xmm6,xmm4 ; xmm6=tmp12 - - movdqa xmm5,xmm7 - paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 - psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 - - paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)] - paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)] - psraw xmm7,PASS1_BITS ; xmm7=data0 - psraw xmm5,PASS1_BITS ; xmm5=data4 - - 
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 - movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movdqa xmm4,xmm1 ; xmm1=tmp13 - movdqa xmm2,xmm1 - punpcklwd xmm4,xmm6 ; xmm6=tmp12 - punpckhwd xmm2,xmm6 - movdqa xmm1,xmm4 - movdqa xmm6,xmm2 - pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L - pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H - pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L - pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H - - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm4,DESCALE_P2 - psrad xmm2,DESCALE_P2 - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm1,DESCALE_P2 - psrad xmm6,DESCALE_P2 - - packssdw xmm4,xmm2 ; xmm4=data2 - packssdw xmm1,xmm6 ; xmm1=data6 - - movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 - - ; -- Odd part - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 - - movdqa xmm2,xmm0 ; xmm0=tmp4 - movdqa xmm6,xmm3 ; xmm3=tmp5 - paddw xmm2,xmm7 ; xmm2=z3 - paddw xmm6,xmm5 ; xmm6=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm4,xmm2 - movdqa xmm1,xmm2 - punpcklwd xmm4,xmm6 - punpckhwd xmm1,xmm6 - movdqa xmm2,xmm4 - movdqa xmm6,xmm1 - pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H - pmaddwd 
xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L - pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movdqa xmm4,xmm0 - movdqa xmm1,xmm0 - punpcklwd xmm4,xmm5 - punpckhwd xmm1,xmm5 - movdqa xmm0,xmm4 - movdqa xmm5,xmm1 - pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H - pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L - pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H - - paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L - paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H - paddd xmm0,xmm2 ; xmm0=data1L - paddd xmm5,xmm6 ; xmm5=data1H - - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm4,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm0,DESCALE_P2 - psrad xmm5,DESCALE_P2 - - packssdw xmm4,xmm1 ; xmm4=data7 - packssdw xmm0,xmm5 ; xmm0=data1 - - movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 - - movdqa xmm1,xmm3 - movdqa xmm5,xmm3 - punpcklwd xmm1,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm3,xmm1 - movdqa xmm7,xmm5 - pmaddwd 
xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L - pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H - pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H - - paddd xmm1,xmm2 ; xmm1=data5L - paddd xmm5,xmm6 ; xmm5=data5H - paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L - paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H - - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm1,DESCALE_P2 - psrad xmm5,DESCALE_P2 - paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm3,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm1,xmm5 ; xmm1=data5 - packssdw xmm3,xmm7 ; xmm3=data3 - - movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa 
xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6, xmm1 + movdqa xmm3, xmm0 + psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2, xmm1 + movdqa xmm5, xmm7 + paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4, xmm3 + movdqa xmm0, xmm6 + paddw xmm3, xmm1 ; xmm3=tmp10 + paddw xmm6, 
xmm7 ; xmm6=tmp11 + psubw xmm4, xmm1 ; xmm4=tmp13 + psubw xmm0, xmm7 ; xmm0=tmp12 + + movdqa xmm1, xmm3 + paddw xmm3, xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1, xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3, PASS1_BITS ; xmm3=data0 + psllw xmm1, PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7, xmm4 ; xmm4=tmp13 + movdqa xmm6, xmm4 + punpcklwd xmm7, xmm0 ; xmm0=tmp12 + punpckhwd xmm6, xmm0 + movdqa xmm4, xmm7 + movdqa xmm0, xmm6 + pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L + pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H + pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L + pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H + + paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7, DESCALE_P1 + psrad xmm6, DESCALE_P1 + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm7, xmm6 ; xmm7=data2 + packssdw xmm4, xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6, xmm2 ; xmm2=tmp4 + movdqa xmm0, xmm5 ; xmm5=tmp5 + paddw xmm6, xmm3 ; xmm6=z3 + paddw xmm0, xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa 
xmm7, xmm6 + movdqa xmm4, xmm6 + punpcklwd xmm7, xmm0 + punpckhwd xmm4, xmm0 + movdqa xmm6, xmm7 + movdqa xmm0, xmm4 + pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L + pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H + pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L + pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + punpcklwd xmm7, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm2, xmm7 + movdqa xmm1, xmm4 + pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L + pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H + pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2, xmm6 ; xmm2=data1L + paddd xmm1, xmm0 ; xmm1=data1H + + paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm2, DESCALE_P1 + psrad xmm1, DESCALE_P1 + + packssdw xmm7, xmm4 ; xmm7=data7 + packssdw xmm2, xmm1 ; xmm2=data1 + 
+ movdqa xmm4, xmm5 + movdqa xmm1, xmm5 + punpcklwd xmm4, xmm3 + punpckhwd xmm1, xmm3 + movdqa xmm5, xmm4 + movdqa xmm3, xmm1 + pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H + pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L + pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H + + paddd xmm4, xmm6 ; xmm4=data5L + paddd xmm1, xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4, DESCALE_P1 + psrad xmm1, DESCALE_P1 + paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm5, DESCALE_P1 + psrad xmm3, DESCALE_P1 + + packssdw xmm4, xmm1 ; xmm4=data5 + packssdw xmm5, xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0, xmm4 ; xmm0=(44 
45 54 55 64 65 74 75) + movdqa xmm3, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2, xmm5 + movdqa xmm7, xmm6 + psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5, xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4, xmm0 ; 
xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0, xmm5 + movdqa xmm3, xmm4 + paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1, xmm7 + movdqa xmm6, xmm2 + paddw xmm7, xmm5 ; xmm7=tmp10 + paddw xmm2, xmm4 ; xmm2=tmp11 + psubw xmm1, xmm5 ; xmm1=tmp13 + psubw xmm6, xmm4 ; xmm6=tmp12 + + movdqa xmm5, xmm7 + paddw xmm7, xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5, xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)] + paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)] + psraw xmm7, PASS1_BITS ; xmm7=data0 + psraw xmm5, PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4, xmm1 ; xmm1=tmp13 + movdqa xmm2, xmm1 + punpcklwd xmm4, xmm6 ; xmm6=tmp12 + punpckhwd xmm2, xmm6 + movdqa xmm1, xmm4 + movdqa xmm6, xmm2 + pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L + pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H + pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L + pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H + + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1, DESCALE_P2 + 
psrad xmm6, DESCALE_P2 + + packssdw xmm4, xmm2 ; xmm4=data2 + packssdw xmm1, xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2, xmm0 ; xmm0=tmp4 + movdqa xmm6, xmm3 ; xmm3=tmp5 + paddw xmm2, xmm7 ; xmm2=z3 + paddw xmm6, xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4, xmm2 + movdqa xmm1, xmm2 + punpcklwd xmm4, xmm6 + punpckhwd xmm1, xmm6 + movdqa xmm2, xmm4 + movdqa xmm6, xmm1 + pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H + pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L + pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm4, xmm5 + punpckhwd xmm1, xmm5 + movdqa xmm0, xmm4 + movdqa xmm5, 
xmm1 + pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L + pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H + pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L + pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0, xmm2 ; xmm0=data1L + paddd xmm5, xmm6 ; xmm5=data1H + + paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm0, DESCALE_P2 + psrad xmm5, DESCALE_P2 + + packssdw xmm4, xmm1 ; xmm4=data7 + packssdw xmm0, xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1, xmm3 + movdqa xmm5, xmm3 + punpcklwd xmm1, xmm7 + punpckhwd xmm5, xmm7 + movdqa xmm3, xmm1 + movdqa xmm7, xmm5 + pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L + pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H + pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H + + paddd xmm1, xmm2 ; xmm1=data5L + paddd xmm5, xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1, DESCALE_P2 + psrad xmm5, DESCALE_P2 + paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm3, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm1, xmm5 ; xmm1=data5 + packssdw xmm3, xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original 
ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jidctflt-sse2-64.asm b/simd/jidctflt-sse2-64.asm index bdda05d..9d78fa1 100644 --- a/simd/jidctflt-sse2-64.asm +++ b/simd/jidctflt-sse2-64.asm @@ -25,34 +25,34 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE %endmacro ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_float_sse2) + alignz 16 + global EXTN(jconst_idct_float_sse2) EXTN(jconst_idct_float_sse2): -PD_1_414 times 4 dd 1.414213562373095048801689 -PD_1_847 times 4 dd 1.847759065022573512256366 -PD_1_082 times 4 dd 1.082392200292393968799446 -PD_M2_613 times 4 dd -2.613125929752753055713286 -PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -66,417 +66,417 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13 = JDIMENSION output_col -%define original_rbp rbp+0 -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT ; FAST_FLOAT workspace[DCTSIZE2] - align 16 - global EXTN(jsimd_idct_float_sse2) + align 16 + global EXTN(jsimd_idct_float_sse2) EXTN(jsimd_idct_float_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [workspace] - collect_args - push rbx - - ; ---- Pass 1: process columns from input, store into work array. - - mov rdx, r10 ; quantptr - mov rsi, r11 ; inptr - lea rdi, [workspace] ; FAST_FLOAT *wsptr - mov rcx, DCTSIZE/4 ; ctr + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [workspace] + collect_args + push rbx + + ; ---- Pass 1: process columns from input, store into work array. 
+ + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + lea rdi, [workspace] ; FAST_FLOAT *wsptr + mov rcx, DCTSIZE/4 ; ctr .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - por xmm1,xmm2 - por xmm3,xmm4 - por xmm5,xmm6 - por xmm1,xmm3 - por xmm5,xmm7 - por xmm1,xmm5 - packsswb xmm1,xmm1 - movd eax,xmm1 - test rax,rax - jnz short .columnDCT - - ; -- AC terms all zero - - movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) - cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm1,xmm0 - movaps xmm2,xmm0 - movaps xmm3,xmm0 - - shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) - shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) - shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) - shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) - - movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 - jmp near .nextcolumn + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD 
[DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1, xmm2 + por xmm3, xmm4 + por xmm5, xmm6 + por xmm1, xmm3 + por xmm5, xmm7 + por xmm1, xmm5 + packsswb xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1, xmm0 + movaps xmm2, xmm0 + movaps xmm3, xmm0 + + shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00) + shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01) + shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02) + shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn %endif .columnDCT: - ; -- Even part - - movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - - punpcklwd xmm0,xmm0 ; 
xmm0=(00 00 01 01 02 02 03 03) - punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) - psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) - psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) - cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) - cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) - - punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) - punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) - psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) - psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) - cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) - cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[rel PD_1_414] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - - punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) - punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) - psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) - psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) - cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) - cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) - - punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) - punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 
72 73 73) - psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) - psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) - cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) - cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) - - mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[rel PD_1_847] ; xmm0=z5 - mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) - addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) - subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) - subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) - movaps xmm3,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) - unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 - - movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) - movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm0,xmm7 - movaps xmm3,xmm5 - addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) - addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) - subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) - subps 
xmm3,xmm4 ; xmm3=data3=(30 31 32 33) - - movaps xmm2,xmm7 ; transpose coefficients(phase 1) - unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) - unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) - movaps xmm4,xmm5 ; transpose coefficients(phase 1) - unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) - unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) - - movaps xmm3,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) - unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) - movaps xmm0,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) - unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) - movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) - - movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 - - movaps xmm6,xmm5 ; transpose coefficients(phase 2) - unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) - unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) - movaps xmm3,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) - unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) - - movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4 - movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + ; -- Even part + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + 
cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [rel PD_1_414] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73) 
+ + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [rel PD_1_847] ; xmm0=z5 + mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13) + movaps xmm3, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm0, xmm7 + movaps xmm3, xmm5 + addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2, xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33) + movaps 
xmm4, xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + movaps xmm0, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6, xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + movaps xmm3, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 .nextcolumn: - add rsi, byte 4*SIZEOF_JCOEF ; coef_block - add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr - add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr - dec rcx ; ctr - jnz near .columnloop - - ; -- Prefetch the next coefficient block - - prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] - prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] - prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] - prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov rax, [original_rbp] - lea rsi, [workspace] ; FAST_FLOAT *wsptr - mov rdi, r12 ; (JSAMPROW *) - mov eax, r13d - mov rcx, DCTSIZE/4 ; ctr + add rsi, byte 4*SIZEOF_JCOEF ; coef_block + add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec rcx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + lea rsi, [workspace] ; FAST_FLOAT *wsptr + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + mov rcx, DCTSIZE/4 ; ctr .rowloop: - ; -- Even part - - movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[rel PD_1_414] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - 
addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[rel PD_1_847] ; xmm0=z5 - mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) - addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) - subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) - subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC] - pcmpeqd xmm3,xmm3 - psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} - - addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) - addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) - addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) - addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) - - pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) - pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) - pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) - pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) - por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) - por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) - - movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 - movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm7,xmm1 - movaps xmm5,xmm3 - addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) - addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) - subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) - subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) - - movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC] - pcmpeqd xmm4,xmm4 - psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} - - addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) - addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) - 
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) - addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) - - pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) - pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) - pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) - pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) - por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) - por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) - - movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] - - packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) - packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) - paddb xmm6,xmm2 - paddb xmm1,xmm2 - - movdqa xmm4,xmm6 ; transpose coefficients(phase 2) - punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 3) - punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - - pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] - mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7 - mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 - movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3 - - add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr - add rdi, byte 4*SIZEOF_JSAMPROW - dec rcx ; ctr - jnz near .rowloop - - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD 
[XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [rel PD_1_414] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [rel PD_1_847] ; xmm0=z5 + mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm3, xmm3 + psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7, xmm1 ; 
xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm7, xmm1 + movaps xmm5, xmm3 + addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm4, xmm4 + psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6, xmm2 + paddb xmm1, xmm2 + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) 
+ punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7 + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3 + + add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add rdi, byte 4*SIZEOF_JSAMPROW + dec rcx ; ctr + jnz near .rowloop + + pop rbx + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jidctflt-sse2.asm b/simd/jidctflt-sse2.asm index a15a9c1..e34d297 100644 --- a/simd/jidctflt-sse2.asm +++ b/simd/jidctflt-sse2.asm @@ -24,34 +24,34 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1, %2, 0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1, %2, 0xEE %endmacro ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_float_sse2) + alignz 16 + global EXTN(jconst_idct_float_sse2) EXTN(jconst_idct_float_sse2): -PD_1_414 times 4 dd 1.414213562373095048801689 -PD_1_847 times 4 dd 1.847759065022573512256366 -PD_1_082 times 4 dd 1.082392200292393968799446 -PD_M2_613 times 4 dd -2.613125929752753055713286 -PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -60,438 +60,438 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; void *dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT ; FAST_FLOAT workspace[DCTSIZE2] - align 16 - global EXTN(jsimd_idct_float_sse2) + align 16 + global EXTN(jsimd_idct_float_sse2) EXTN(jsimd_idct_float_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; FAST_FLOAT *wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT *wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm1,xmm2 - por xmm3,xmm4 - por xmm5,xmm6 - por xmm1,xmm3 - por xmm5,xmm7 - por xmm1,xmm5 - packsswb xmm1,xmm1 - movd eax,xmm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) - cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm1,xmm0 - movaps xmm2,xmm0 - movaps xmm3,xmm0 - - shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) - shufps xmm1,xmm1,0x55 ; 
xmm1=(01 01 01 01) - shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) - shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 - jmp near .nextcolumn - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1, xmm2 + por xmm3, xmm4 + por xmm5, xmm6 + por xmm1, xmm3 + por xmm5, xmm7 + por xmm1, xmm5 + packsswb xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1, xmm0 + movaps xmm2, xmm0 + movaps xmm3, xmm0 + + shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00) + shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01) + shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02) + shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD 
[XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16, 7 %endif .columnDCT: - ; -- Even part - - movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) - psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) - psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) - cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) - cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) - - punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) - punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) - psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) - psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) - cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) - cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[GOTOFF(ebx,PD_1_414)] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - 
movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) - punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) - psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) - psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) - cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) - cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) - - punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) - punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) - psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) - psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) - cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) - cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) - - mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 - mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) - addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) - subps xmm5,xmm1 ; xmm5=data7=(70 
71 72 73) - subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) - movaps xmm3,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) - unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 - - movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) - movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm0,xmm7 - movaps xmm3,xmm5 - addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) - addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) - subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) - subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) - - movaps xmm2,xmm7 ; transpose coefficients(phase 1) - unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) - unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) - movaps xmm4,xmm5 ; transpose coefficients(phase 1) - unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) - unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) - - movaps xmm3,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) - unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) - movaps xmm0,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) - unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) - movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - - movaps xmm6,xmm5 ; transpose coefficients(phase 2) - unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) - unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) - movaps xmm3,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) - unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) - - 
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 - movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + ; -- Even part + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [GOTOFF(ebx,PD_1_414)] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD 
[MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, xmm6 ; transpose coefficients(phase 
1) + unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13) + movaps xmm3, xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm0, xmm7 + movaps xmm3, xmm5 + addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2, xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33) + movaps xmm4, xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3, xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + movaps xmm0, xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6, xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + movaps xmm3, xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD 
[XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 .nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr - add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; -- Prefetch the next coefficient block - - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - lea esi, [workspace] ; FAST_FLOAT *wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT *wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16, 7 .rowloop: - ; -- Even part - - movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[GOTOFF(ebx,PD_1_414)] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 - mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) - addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) - subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) - subps 
xmm0,xmm3 ; xmm0=data6=(06 16 26 36) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] - pcmpeqd xmm3,xmm3 - psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} - - addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) - addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) - addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) - addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) - - pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) - pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) - pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) - pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) - por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) - por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) - - movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 - movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm7,xmm1 - movaps xmm5,xmm3 - addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) - addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) - subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) - subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) - - movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] - pcmpeqd xmm4,xmm4 - psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} - - addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) - addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) - addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) - addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) - - pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) - pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) - pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) - pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) - por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) - por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) - - movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] - - packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 
20 21 30 31 04 05 14 15 24 25 34 35) - packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) - paddb xmm6,xmm2 - paddb xmm1,xmm2 - - movdqa xmm4,xmm6 ; transpose coefficients(phase 2) - punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 3) - punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - - pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 - mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 - movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 - - poppic ebx ; restore GOT address - - add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr - add edi, byte 4*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + subps xmm0, xmm2 ; xmm0=tmp11 + subps xmm1, xmm3 + addps xmm4, xmm2 ; xmm4=tmp10 + addps xmm5, xmm3 ; xmm5=tmp13 + + mulps xmm1, [GOTOFF(ebx,PD_1_414)] + subps xmm1, xmm5 ; xmm1=tmp12 + + movaps xmm6, 
xmm4 + movaps xmm7, xmm0 + subps xmm4, xmm5 ; xmm4=tmp3 + subps xmm0, xmm1 ; xmm0=tmp2 + addps xmm6, xmm5 ; xmm6=tmp0 + addps xmm7, xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4, xmm2 + movaps xmm0, xmm5 + addps xmm2, xmm1 ; xmm2=z11 + addps xmm5, xmm3 ; xmm5=z13 + subps xmm4, xmm1 ; xmm4=z12 + subps xmm0, xmm3 ; xmm0=z10 + + movaps xmm1, xmm2 + subps xmm2, xmm5 + addps xmm1, xmm5 ; xmm1=tmp7 + + mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3, xmm0 + addps xmm0, xmm4 + mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3, xmm0 ; xmm3=tmp12 + subps xmm4, xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3, xmm1 ; xmm3=tmp6 + movaps xmm5, xmm6 + movaps xmm0, xmm7 + addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2, xmm3 ; xmm2=tmp5 + + movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] + pcmpeqd xmm3, xmm3 + psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5, WORD_BIT ; 
xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4, xmm2 ; xmm4=tmp4 + movaps xmm7, xmm1 + movaps xmm5, xmm3 + addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] + pcmpeqd xmm4, xmm4 + psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6, xmm2 + paddb xmm1, xmm2 + + movdqa xmm4, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7, xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 
03 04 05 06 07) + pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jidctfst-sse2-64.asm b/simd/jidctfst-sse2-64.asm index 4884642..93dd6aa 100644 --- a/simd/jidctfst-sse2-64.asm +++ b/simd/jidctfst-sse2-64.asm @@ -26,54 +26,54 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 8 ; 14 is also OK. -%define PASS1_BITS 2 +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 %if IFAST_SCALE_BITS != PASS1_BITS %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." %endif %if CONST_BITS == 8 -F_1_082 equ 277 ; FIX(1.082392200) -F_1_414 equ 362 ; FIX(1.414213562) -F_1_847 equ 473 ; FIX(1.847759065) -F_2_613 equ 669 ; FIX(2.613125930) -F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) -F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) -F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) -%define PRE_MULTIPLY_SCALE_BITS 2 -%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 - global EXTN(jconst_idct_ifast_sse2) + alignz 16 + global EXTN(jconst_idct_ifast_sse2) EXTN(jconst_idct_ifast_sse2): -PW_F1414 times 8 dw F_1_414 << CONST_SHIFT -PW_F1847 times 8 dw F_1_847 << CONST_SHIFT -PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT -PW_F1082 times 8 dw F_1_082 << CONST_SHIFT -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -87,405 +87,405 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13 = JDIMENSION output_col -%define original_rbp rbp+0 -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_idct_ifast_sse2) + align 16 + global EXTN(jsimd_idct_ifast_sse2) EXTN(jsimd_idct_ifast_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args - ; ---- Pass 1: process columns from input. + ; ---- Pass 1: process columns from input. - mov rdx, r10 ; quantptr - mov rsi, r11 ; inptr + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - por xmm1,xmm0 - packsswb xmm1,xmm1 - packsswb xmm1,xmm1 - movd eax,xmm1 - test rax,rax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpckhwd 
xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) - - pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) - pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) - pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) - pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) - pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) - pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) - pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) - pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 - jmp near .column_end + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3, xmm7, 
0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end %endif .columnDCT: - ; -- Even part - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - - movdqa xmm4,xmm0 - movdqa xmm5,xmm1 - psubw xmm0,xmm2 ; xmm0=tmp11 - psubw xmm1,xmm3 - paddw xmm4,xmm2 ; xmm4=tmp10 - paddw xmm5,xmm3 ; xmm5=tmp13 - - psllw xmm1,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm1,[rel PW_F1414] - psubw xmm1,xmm5 ; xmm1=tmp12 - - movdqa xmm6,xmm4 - movdqa xmm7,xmm0 - psubw xmm4,xmm5 ; xmm4=tmp3 - psubw xmm0,xmm1 ; xmm0=tmp2 - paddw xmm6,xmm5 ; xmm6=tmp0 - paddw xmm7,xmm1 ; xmm7=tmp1 - - movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 - - ; -- Odd part - - movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - - movdqa xmm4,xmm2 - movdqa xmm0,xmm5 - psubw xmm2,xmm1 ; xmm2=z12 - psubw xmm5,xmm3 ; xmm5=z10 - paddw xmm4,xmm1 ; xmm4=z11 - paddw xmm0,xmm3 ; xmm0=z13 - - movdqa xmm1,xmm5 ; xmm1=z10(unscaled) - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - psllw 
xmm5,PRE_MULTIPLY_SCALE_BITS - - movdqa xmm3,xmm4 - psubw xmm4,xmm0 - paddw xmm3,xmm0 ; xmm3=tmp7 - - psllw xmm4,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11 - - ; To avoid overflow... - ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movdqa xmm0,xmm5 - paddw xmm5,xmm2 - pmulhw xmm5,[rel PW_F1847] ; xmm5=z5 - pmulhw xmm0,[rel PW_MF1613] - pmulhw xmm2,[rel PW_F1082] - psubw xmm0,xmm1 - psubw xmm2,xmm5 ; xmm2=tmp10 - paddw xmm0,xmm5 ; xmm0=tmp12 - - ; -- Final output stage - - psubw xmm0,xmm3 ; xmm0=tmp6 - movdqa xmm1,xmm6 - movdqa xmm5,xmm7 - paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) - paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) - psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) - psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) - psubw xmm4,xmm0 ; xmm4=tmp5 - - movdqa xmm3,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) - punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) - movdqa xmm0,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) - punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) - - paddw xmm2,xmm4 ; xmm2=tmp4 - movdqa xmm5,xmm7 - movdqa xmm0,xmm1 - paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) - paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) - psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) - psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) - punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) - movdqa xmm2,xmm1 ; transpose 
coefficients(phase 1) - punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) - - movdqa xmm0,xmm3 ; transpose coefficients(phase 2) - punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) - punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) - movdqa xmm5,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) - punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) - - movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) - movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) - - movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) - - movdqa xmm3,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) - punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) - movdqa xmm0,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) - - movdqa xmm4,xmm6 ; transpose coefficients(phase 3) - punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) - punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) - movdqa xmm7,xmm5 ; transpose coefficients(phase 3) - punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) - punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) - movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 - - movdqa xmm4,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) - punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) - movdqa xmm7,xmm3 ; transpose coefficients(phase 3) - punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) - punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) + ; -- Even part + + 
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + psubw xmm0, xmm2 ; xmm0=tmp11 + psubw xmm1, xmm3 + paddw xmm4, xmm2 ; xmm4=tmp10 + paddw xmm5, xmm3 ; xmm5=tmp13 + + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1, [rel PW_F1414] + psubw xmm1, xmm5 ; xmm1=tmp12 + + movdqa xmm6, xmm4 + movdqa xmm7, xmm0 + psubw xmm4, xmm5 ; xmm4=tmp3 + psubw xmm0, xmm1 ; xmm0=tmp2 + paddw xmm6, xmm5 ; xmm6=tmp0 + paddw xmm7, xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm2 + movdqa xmm0, xmm5 + psubw xmm2, xmm1 ; xmm2=z12 + psubw xmm5, xmm3 ; xmm5=z10 + paddw xmm4, xmm1 ; xmm4=z11 + paddw xmm0, xmm3 ; xmm0=z13 + + movdqa xmm1, xmm5 ; xmm1=z10(unscaled) + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3, xmm4 + psubw xmm4, xmm0 + paddw xmm3, xmm0 ; xmm3=tmp7 + + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0, xmm5 + paddw xmm5, xmm2 + pmulhw xmm5, [rel PW_F1847] ; xmm5=z5 + pmulhw xmm0, [rel PW_MF1613] + pmulhw xmm2, [rel PW_F1082] + psubw xmm0, xmm1 + psubw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm0, xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0, xmm3 ; xmm0=tmp6 + movdqa xmm1, xmm6 + movdqa xmm5, xmm7 + paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4, xmm0 ; xmm4=tmp5 + + movdqa xmm3, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2, xmm4 ; xmm2=tmp4 + movdqa xmm5, xmm7 + movdqa xmm0, xmm1 + paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0, xmm3 ; transpose 
coefficients(phase 2) + punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7, xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7, xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) .column_end: - ; -- Prefetch the next coefficient block - - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - 
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov rax, [original_rbp] - mov rdi, r12 ; (JSAMPROW *) - mov eax, r13d - - ; -- Even part - - ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 - - movdqa xmm2,xmm6 - movdqa xmm0,xmm5 - psubw xmm6,xmm1 ; xmm6=tmp11 - psubw xmm5,xmm3 - paddw xmm2,xmm1 ; xmm2=tmp10 - paddw xmm0,xmm3 ; xmm0=tmp13 - - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[rel PW_F1414] - psubw xmm5,xmm0 ; xmm5=tmp12 - - movdqa xmm1,xmm2 - movdqa xmm3,xmm6 - psubw xmm2,xmm0 ; xmm2=tmp3 - psubw xmm6,xmm5 ; xmm6=tmp2 - paddw xmm1,xmm0 ; xmm1=tmp0 - paddw xmm3,xmm5 ; xmm3=tmp1 - - movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 - - ; -- Odd part - - ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 - - movdqa xmm2,xmm0 - movdqa xmm6,xmm4 - psubw xmm0,xmm7 ; xmm0=z12 - psubw xmm4,xmm5 ; xmm4=z10 - paddw xmm2,xmm7 ; xmm2=z11 - paddw xmm6,xmm5 ; xmm6=z13 - - movdqa xmm7,xmm4 ; xmm7=z10(unscaled) - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - psllw xmm4,PRE_MULTIPLY_SCALE_BITS - - movdqa xmm5,xmm2 - psubw xmm2,xmm6 - paddw xmm5,xmm6 ; xmm5=tmp7 - - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11 - - ; To avoid overflow... 
- ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movdqa xmm6,xmm4 - paddw xmm4,xmm0 - pmulhw xmm4,[rel PW_F1847] ; xmm4=z5 - pmulhw xmm6,[rel PW_MF1613] - pmulhw xmm0,[rel PW_F1082] - psubw xmm6,xmm7 - psubw xmm0,xmm4 ; xmm0=tmp10 - paddw xmm6,xmm4 ; xmm6=tmp12 - - ; -- Final output stage - - psubw xmm6,xmm5 ; xmm6=tmp6 - movdqa xmm7,xmm1 - movdqa xmm4,xmm3 - paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) - paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) - psraw xmm1,(PASS1_BITS+3) ; descale - psraw xmm3,(PASS1_BITS+3) ; descale - psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) - psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) - psraw xmm7,(PASS1_BITS+3) ; descale - psraw xmm4,(PASS1_BITS+3) ; descale - psubw xmm2,xmm6 ; xmm2=tmp5 - - packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 - movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 - - paddw xmm0,xmm2 ; xmm0=tmp4 - movdqa xmm4,xmm5 - movdqa xmm7,xmm6 - paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) - paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) - psraw xmm5,(PASS1_BITS+3) ; descale - psraw xmm6,(PASS1_BITS+3) ; descale - psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) - psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) - psraw xmm4,(PASS1_BITS+3) ; descale - psraw xmm7,(PASS1_BITS+3) ; descale - - movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] - - packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) - packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) - - paddb xmm1,xmm2 - paddb xmm3,xmm2 - paddb xmm5,xmm2 - paddb xmm7,xmm2 - - movdqa xmm0,xmm1 ; transpose coefficients(phase 1) - punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 
50 51 60 61 70 71) - punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) - movdqa xmm6,xmm5 ; transpose coefficients(phase 1) - punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) - punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) - - movdqa xmm4,xmm1 ; transpose coefficients(phase 2) - punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) - movdqa xmm2,xmm6 ; transpose coefficients(phase 2) - punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) - - movdqa xmm3,xmm1 ; transpose coefficients(phase 3) - punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - movdqa xmm7,xmm4 ; transpose coefficients(phase 3) - punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) - punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) - - pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) - pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) - - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 - mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 - - mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 - 
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 - mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 - - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret - ret + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2, xmm6 + movdqa xmm0, xmm5 + psubw xmm6, xmm1 ; xmm6=tmp11 + psubw xmm5, xmm3 + paddw xmm2, xmm1 ; xmm2=tmp10 + paddw xmm0, xmm3 ; xmm0=tmp13 + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [rel PW_F1414] + psubw xmm5, xmm0 ; xmm5=tmp12 + + movdqa xmm1, xmm2 + movdqa xmm3, xmm6 + psubw xmm2, xmm0 ; xmm2=tmp3 + psubw xmm6, xmm5 ; xmm6=tmp2 + paddw xmm1, xmm0 ; xmm1=tmp0 + paddw xmm3, xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2, xmm0 + movdqa xmm6, xmm4 + psubw xmm0, xmm7 ; xmm0=z12 + psubw xmm4, xmm5 ; xmm4=z10 + paddw xmm2, xmm7 ; xmm2=z11 + paddw xmm6, xmm5 ; xmm6=z13 + + movdqa xmm7, xmm4 ; xmm7=z10(unscaled) + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5, xmm2 + psubw xmm2, xmm6 + paddw xmm5, xmm6 ; xmm5=tmp7 + + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6, xmm4 + paddw xmm4, xmm0 + pmulhw xmm4, [rel PW_F1847] ; xmm4=z5 + pmulhw xmm6, [rel PW_MF1613] + pmulhw xmm0, [rel PW_F1082] + psubw xmm6, xmm7 + psubw xmm0, xmm4 ; xmm0=tmp10 + paddw xmm6, xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6, xmm5 ; xmm6=tmp6 + movdqa xmm7, xmm1 + movdqa xmm4, xmm3 + paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1, (PASS1_BITS+3) ; descale + psraw xmm3, (PASS1_BITS+3) ; descale + psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7, (PASS1_BITS+3) ; descale + psraw xmm4, (PASS1_BITS+3) ; descale + psubw xmm2, xmm6 ; xmm2=tmp5 + + packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0, xmm2 ; xmm0=tmp4 + movdqa xmm4, xmm5 + movdqa xmm7, xmm6 + paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5, (PASS1_BITS+3) ; descale + psraw xmm6, (PASS1_BITS+3) ; descale + psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4, (PASS1_BITS+3) ; descale + psraw xmm7, (PASS1_BITS+3) ; descale + + movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1, xmm2 + paddb xmm3, xmm2 + paddb xmm5, xmm2 + paddb xmm7, xmm2 + + movdqa xmm0, xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1, 
xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 + + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW 
[rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jidctfst-sse2.asm b/simd/jidctfst-sse2.asm index f591e55..c5e5f33 100644 --- a/simd/jidctfst-sse2.asm +++ b/simd/jidctfst-sse2.asm @@ -25,54 +25,54 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 8 ; 14 is also OK. -%define PASS1_BITS 2 +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 %if IFAST_SCALE_BITS != PASS1_BITS %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." %endif %if CONST_BITS == 8 -F_1_082 equ 277 ; FIX(1.082392200) -F_1_414 equ 362 ; FIX(1.414213562) -F_1_847 equ 473 ; FIX(1.847759065) -F_2_613 equ 669 ; FIX(2.613125930) -F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) -F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) -F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +F_1_082 equ DESCALE(1162209775, 30-CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249, 30-CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602, 30-CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) -%define PRE_MULTIPLY_SCALE_BITS 2 -%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) +%define PRE_MULTIPLY_SCALE_BITS 2 +%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 - global EXTN(jconst_idct_ifast_sse2) + alignz 16 + global EXTN(jconst_idct_ifast_sse2) EXTN(jconst_idct_ifast_sse2): -PW_F1414 times 8 dw F_1_414 << CONST_SHIFT -PW_F1847 times 8 dw F_1_847 << CONST_SHIFT -PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT -PW_F1082 times 8 dw F_1_082 << CONST_SHIFT -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -81,421 +81,421 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; jpeg_component_info *compptr -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; jpeg_component_info *compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_idct_ifast_sse2) + align 16 + global EXTN(jsimd_idct_ifast_sse2) EXTN(jsimd_idct_ifast_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm1,xmm0 - packsswb xmm1,xmm1 - packsswb xmm1,xmm1 - movd eax,xmm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) - - pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) - pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) - pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) - pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) - pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) - pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) - pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) - pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 - jmp near .column_end - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD 
[XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end + alignx 16, 7 %endif .columnDCT: - ; -- Even part - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movdqa xmm4,xmm0 - movdqa xmm5,xmm1 - psubw xmm0,xmm2 ; xmm0=tmp11 - psubw xmm1,xmm3 - paddw xmm4,xmm2 ; xmm4=tmp10 - paddw xmm5,xmm3 ; 
xmm5=tmp13 - - psllw xmm1,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm1,[GOTOFF(ebx,PW_F1414)] - psubw xmm1,xmm5 ; xmm1=tmp12 - - movdqa xmm6,xmm4 - movdqa xmm7,xmm0 - psubw xmm4,xmm5 ; xmm4=tmp3 - psubw xmm0,xmm1 ; xmm0=tmp2 - paddw xmm6,xmm5 ; xmm6=tmp0 - paddw xmm7,xmm1 ; xmm7=tmp1 - - movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 - - ; -- Odd part - - movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] - movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movdqa xmm4,xmm2 - movdqa xmm0,xmm5 - psubw xmm2,xmm1 ; xmm2=z12 - psubw xmm5,xmm3 ; xmm5=z10 - paddw xmm4,xmm1 ; xmm4=z11 - paddw xmm0,xmm3 ; xmm0=z13 - - movdqa xmm1,xmm5 ; xmm1=z10(unscaled) - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - - movdqa xmm3,xmm4 - psubw xmm4,xmm0 - paddw xmm3,xmm0 ; xmm3=tmp7 - - psllw xmm4,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 - - ; To avoid overflow... 
- ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movdqa xmm0,xmm5 - paddw xmm5,xmm2 - pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5 - pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)] - pmulhw xmm2,[GOTOFF(ebx,PW_F1082)] - psubw xmm0,xmm1 - psubw xmm2,xmm5 ; xmm2=tmp10 - paddw xmm0,xmm5 ; xmm0=tmp12 - - ; -- Final output stage - - psubw xmm0,xmm3 ; xmm0=tmp6 - movdqa xmm1,xmm6 - movdqa xmm5,xmm7 - paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) - paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) - psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) - psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) - psubw xmm4,xmm0 ; xmm4=tmp5 - - movdqa xmm3,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) - punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) - movdqa xmm0,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) - punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) - - paddw xmm2,xmm4 ; xmm2=tmp4 - movdqa xmm5,xmm7 - movdqa xmm0,xmm1 - paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) - paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) - psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) - psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) - punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) - movdqa xmm2,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) - - movdqa xmm0,xmm3 ; transpose coefficients(phase 2) - 
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) - punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) - movdqa xmm5,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) - punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) - - movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) - movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) - - movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) - - movdqa xmm3,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) - punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) - movdqa xmm0,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) - - movdqa xmm4,xmm6 ; transpose coefficients(phase 3) - punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) - punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) - movdqa xmm7,xmm5 ; transpose coefficients(phase 3) - punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) - punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) - movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 - - movdqa xmm4,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) - punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) - movdqa xmm7,xmm3 ; transpose coefficients(phase 3) - punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) - punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, 
XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm1 + psubw xmm0, xmm2 ; xmm0=tmp11 + psubw xmm1, xmm3 + paddw xmm4, xmm2 ; xmm4=tmp10 + paddw xmm5, xmm3 ; xmm5=tmp13 + + psllw xmm1, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1, [GOTOFF(ebx,PW_F1414)] + psubw xmm1, xmm5 ; xmm1=tmp12 + + movdqa xmm6, xmm4 + movdqa xmm7, xmm0 + psubw xmm4, xmm5 ; xmm4=tmp3 + psubw xmm0, xmm1 ; xmm0=tmp2 + paddw xmm6, xmm5 ; xmm6=tmp0 + paddw xmm7, xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4, xmm2 + movdqa xmm0, xmm5 + psubw xmm2, xmm1 ; xmm2=z12 + psubw xmm5, xmm3 ; xmm5=z10 + paddw xmm4, xmm1 ; xmm4=z11 + paddw xmm0, xmm3 ; xmm0=z13 + + movdqa xmm1, xmm5 ; xmm1=z10(unscaled) + psllw xmm2, PRE_MULTIPLY_SCALE_BITS + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3, xmm4 + psubw xmm4, xmm0 + paddw xmm3, xmm0 ; xmm3=tmp7 + + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0, xmm5 + paddw xmm5, xmm2 + pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5 + pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)] + pmulhw xmm2, [GOTOFF(ebx,PW_F1082)] + psubw xmm0, xmm1 + psubw xmm2, xmm5 ; xmm2=tmp10 + paddw xmm0, xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0, xmm3 ; xmm0=tmp6 + movdqa xmm1, xmm6 + movdqa xmm5, xmm7 + paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4, xmm0 ; xmm4=tmp5 + + movdqa xmm3, xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2, xmm4 ; xmm2=tmp4 + movdqa xmm5, xmm7 + movdqa xmm0, xmm1 + paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0, xmm3 ; 
transpose coefficients(phase 2) + punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4, xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7, xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7, xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) .column_end: - ; -- Prefetch the next coefficient block - - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 
1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - - ; -- Even part - - ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 - - movdqa xmm2,xmm6 - movdqa xmm0,xmm5 - psubw xmm6,xmm1 ; xmm6=tmp11 - psubw xmm5,xmm3 - paddw xmm2,xmm1 ; xmm2=tmp10 - paddw xmm0,xmm3 ; xmm0=tmp13 - - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[GOTOFF(ebx,PW_F1414)] - psubw xmm5,xmm0 ; xmm5=tmp12 - - movdqa xmm1,xmm2 - movdqa xmm3,xmm6 - psubw xmm2,xmm0 ; xmm2=tmp3 - psubw xmm6,xmm5 ; xmm6=tmp2 - paddw xmm1,xmm0 ; xmm1=tmp0 - paddw xmm3,xmm5 ; xmm3=tmp1 - - movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 - - ; -- Odd part - - ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 - - movdqa xmm2,xmm0 - movdqa xmm6,xmm4 - psubw xmm0,xmm7 ; xmm0=z12 - psubw xmm4,xmm5 ; xmm4=z10 - paddw xmm2,xmm7 ; xmm2=z11 - paddw xmm6,xmm5 ; xmm6=z13 - - movdqa xmm7,xmm4 ; xmm7=z10(unscaled) - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - psllw xmm4,PRE_MULTIPLY_SCALE_BITS - - movdqa xmm5,xmm2 - psubw xmm2,xmm6 - paddw xmm5,xmm6 ; xmm5=tmp7 - - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 - - ; To avoid overflow... 
- ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movdqa xmm6,xmm4 - paddw xmm4,xmm0 - pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5 - pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)] - pmulhw xmm0,[GOTOFF(ebx,PW_F1082)] - psubw xmm6,xmm7 - psubw xmm0,xmm4 ; xmm0=tmp10 - paddw xmm6,xmm4 ; xmm6=tmp12 - - ; -- Final output stage - - psubw xmm6,xmm5 ; xmm6=tmp6 - movdqa xmm7,xmm1 - movdqa xmm4,xmm3 - paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) - paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) - psraw xmm1,(PASS1_BITS+3) ; descale - psraw xmm3,(PASS1_BITS+3) ; descale - psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) - psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) - psraw xmm7,(PASS1_BITS+3) ; descale - psraw xmm4,(PASS1_BITS+3) ; descale - psubw xmm2,xmm6 ; xmm2=tmp5 - - packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 - movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 - - paddw xmm0,xmm2 ; xmm0=tmp4 - movdqa xmm4,xmm5 - movdqa xmm7,xmm6 - paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) - paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) - psraw xmm5,(PASS1_BITS+3) ; descale - psraw xmm6,(PASS1_BITS+3) ; descale - psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) - psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) - psraw xmm4,(PASS1_BITS+3) ; descale - psraw xmm7,(PASS1_BITS+3) ; descale - - movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] - - packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) - packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) - - paddb xmm1,xmm2 - paddb xmm3,xmm2 - paddb xmm5,xmm2 - paddb xmm7,xmm2 - - movdqa xmm0,xmm1 ; transpose coefficients(phase 1) - punpcklbw xmm1,xmm3 ; xmm1=(00 
01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) - punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) - movdqa xmm6,xmm5 ; transpose coefficients(phase 1) - punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) - punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) - - movdqa xmm4,xmm1 ; transpose coefficients(phase 2) - punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) - movdqa xmm2,xmm6 ; transpose coefficients(phase 2) - punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) - - movdqa xmm3,xmm1 ; transpose coefficients(phase 3) - punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - movdqa xmm7,xmm4 ; transpose coefficients(phase 3) - punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) - punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) - - pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) - pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 - mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 - - mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD 
[edx+eax*SIZEOF_JSAMPLE], xmm5 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 - mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2, xmm6 + movdqa xmm0, xmm5 + psubw xmm6, xmm1 ; xmm6=tmp11 + psubw xmm5, xmm3 + paddw xmm2, xmm1 ; xmm2=tmp10 + paddw xmm0, xmm3 ; xmm0=tmp13 + + psllw xmm5, PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5, [GOTOFF(ebx,PW_F1414)] + psubw xmm5, xmm0 ; xmm5=tmp12 + + movdqa xmm1, xmm2 + movdqa xmm3, xmm6 + psubw xmm2, xmm0 ; xmm2=tmp3 + psubw xmm6, xmm5 ; xmm6=tmp2 + paddw xmm1, xmm0 ; xmm1=tmp0 + paddw xmm3, xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2, xmm0 + movdqa xmm6, xmm4 + psubw xmm0, xmm7 ; xmm0=z12 + psubw xmm4, xmm5 ; xmm4=z10 + paddw xmm2, xmm7 ; xmm2=z11 + paddw xmm6, xmm5 ; xmm6=z13 + + movdqa xmm7, xmm4 ; xmm7=z10(unscaled) + psllw xmm0, PRE_MULTIPLY_SCALE_BITS + psllw xmm4, PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5, xmm2 + psubw xmm2, xmm6 + paddw xmm5, xmm6 ; xmm5=tmp7 + + psllw xmm2, 
PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6, xmm4 + paddw xmm4, xmm0 + pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5 + pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)] + pmulhw xmm0, [GOTOFF(ebx,PW_F1082)] + psubw xmm6, xmm7 + psubw xmm0, xmm4 ; xmm0=tmp10 + paddw xmm6, xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6, xmm5 ; xmm6=tmp6 + movdqa xmm7, xmm1 + movdqa xmm4, xmm3 + paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1, (PASS1_BITS+3) ; descale + psraw xmm3, (PASS1_BITS+3) ; descale + psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7, (PASS1_BITS+3) ; descale + psraw xmm4, (PASS1_BITS+3) ; descale + psubw xmm2, xmm6 ; xmm2=tmp5 + + packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0, xmm2 ; xmm0=tmp4 + movdqa xmm4, xmm5 + movdqa xmm7, xmm6 + paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5, (PASS1_BITS+3) ; descale + psraw xmm6, (PASS1_BITS+3) ; descale + psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4, (PASS1_BITS+3) ; descale + psraw xmm7, (PASS1_BITS+3) ; descale + + movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1, xmm2 + 
paddb xmm3, xmm2 + paddb xmm5, xmm2 + paddb xmm7, xmm2 + + movdqa xmm0, xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2, xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3, xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], 
xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jidctint-sse2-64.asm b/simd/jidctint-sse2-64.asm index afe1d6a..6331181 100644 --- a/simd/jidctint-sse2-64.asm +++ b/simd/jidctint-sse2-64.asm @@ -26,67 +26,67 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) %if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 
15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) %else ; NASM cannot do compile-time arithmetic on floating-point constants. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST 
+ SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_islow_sse2) + alignz 16 + global EXTN(jconst_idct_islow_sse2) EXTN(jconst_idct_islow_sse2): -PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) -PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -100,748 +100,748 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13 = JDIMENSION output_col -%define original_rbp rbp+0 -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 12 +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 12 - align 16 - global EXTN(jsimd_idct_islow_sse2) + align 16 + global EXTN(jsimd_idct_islow_sse2) EXTN(jsimd_idct_islow_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args - ; ---- Pass 1: process columns from input. + ; ---- Pass 1: process columns from input. - mov rdx, r10 ; quantptr - mov rsi, r11 ; inptr + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - por xmm1,xmm0 - packsswb xmm1,xmm1 - packsswb xmm1,xmm1 - movd eax,xmm1 - test rax,rax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw xmm5,PASS1_BITS - - movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 
01 02 02 03 03) - punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) - - pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) - pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) - pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) - pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) - pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) - pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) - pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) - pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) - - movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 - movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 - movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 - movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 - jmp near .column_end + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5, PASS1_BITS + + movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + 
pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end %endif .columnDCT: - ; -- Even part - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movdqa xmm4,xmm1 ; xmm1=in2=z2 - movdqa xmm5,xmm1 - punpcklwd xmm4,xmm3 ; xmm3=in6=z3 - punpckhwd xmm5,xmm3 - movdqa xmm1,xmm4 - movdqa xmm3,xmm5 - pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L - pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H - pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L - pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H - - movdqa xmm6,xmm0 - paddw xmm0,xmm2 ; xmm0=in0+in4 - psubw xmm6,xmm2 ; xmm6=in0-in4 - - pxor xmm7,xmm7 - pxor xmm2,xmm2 - punpcklwd xmm7,xmm0 ; xmm7=tmp0L - punpckhwd xmm2,xmm0 ; xmm2=tmp0H - psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS - psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS - - movdqa xmm0,xmm7 - paddd xmm7,xmm4 ; xmm7=tmp10L - psubd 
xmm0,xmm4 ; xmm0=tmp13L - movdqa xmm4,xmm2 - paddd xmm2,xmm5 ; xmm2=tmp10H - psubd xmm4,xmm5 ; xmm4=tmp13H - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L - movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H - movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L - movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H - - pxor xmm5,xmm5 - pxor xmm7,xmm7 - punpcklwd xmm5,xmm6 ; xmm5=tmp1L - punpckhwd xmm7,xmm6 ; xmm7=tmp1H - psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS - psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS - - movdqa xmm2,xmm5 - paddd xmm5,xmm1 ; xmm5=tmp11L - psubd xmm2,xmm1 ; xmm2=tmp12L - movdqa xmm0,xmm7 - paddd xmm7,xmm3 ; xmm7=tmp11H - psubd xmm0,xmm3 ; xmm0=tmp12H - - movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L - movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H - movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L - movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H - - ; -- Odd part - - movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm5,xmm6 - movdqa xmm7,xmm4 - paddw xmm5,xmm3 ; xmm5=z3 - paddw xmm7,xmm1 ; xmm7=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm2,xmm5 - movdqa xmm0,xmm5 - punpcklwd xmm2,xmm7 - punpckhwd xmm0,xmm7 - movdqa xmm5,xmm2 - movdqa xmm7,xmm0 - pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L - pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H - pmaddwd 
xmm5,[rel PW_F117_F078] ; xmm5=z4L - pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H - - movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L - movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movdqa xmm2,xmm3 - movdqa xmm0,xmm3 - punpcklwd xmm2,xmm4 - punpckhwd xmm0,xmm4 - movdqa xmm3,xmm2 - movdqa xmm4,xmm0 - pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L - pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H - pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L - pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H - - paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L - paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H - paddd xmm3,xmm5 ; xmm3=tmp3L - paddd xmm4,xmm7 ; xmm4=tmp3H - - movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L - movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H - - movdqa xmm2,xmm1 - movdqa xmm0,xmm1 - punpcklwd xmm2,xmm6 - punpckhwd xmm0,xmm6 - movdqa xmm1,xmm2 - movdqa xmm6,xmm0 - pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L - pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H - pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L - pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H - - paddd xmm2,xmm5 ; xmm2=tmp1L - paddd xmm0,xmm7 ; xmm0=tmp1H - paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L - paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H - - movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L - movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H - - ; -- Final output stage - - movdqa 
xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L - movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H - - movdqa xmm2,xmm5 - movdqa xmm0,xmm7 - paddd xmm5,xmm3 ; xmm5=data0L - paddd xmm7,xmm4 ; xmm7=data0H - psubd xmm2,xmm3 ; xmm2=data7L - psubd xmm0,xmm4 ; xmm0=data7H - - movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] - - paddd xmm5,xmm3 - paddd xmm7,xmm3 - psrad xmm5,DESCALE_P1 - psrad xmm7,DESCALE_P1 - paddd xmm2,xmm3 - paddd xmm0,xmm3 - psrad xmm2,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) - packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) - - movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L - movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H - - movdqa xmm7,xmm4 - movdqa xmm0,xmm3 - paddd xmm4,xmm1 ; xmm4=data1L - paddd xmm3,xmm6 ; xmm3=data1H - psubd xmm7,xmm1 ; xmm7=data6L - psubd xmm0,xmm6 ; xmm0=data6H - - movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] - - paddd xmm4,xmm1 - paddd xmm3,xmm1 - psrad xmm4,DESCALE_P1 - psrad xmm3,DESCALE_P1 - paddd xmm7,xmm1 - paddd xmm0,xmm1 - psrad xmm7,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) - packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) - - movdqa xmm6,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) - punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) - movdqa xmm1,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) - punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) - - movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L - movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H - movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L - movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) - movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) - movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) - - 
movdqa xmm5,xmm3 - movdqa xmm6,xmm0 - paddd xmm3,xmm4 ; xmm3=data2L - paddd xmm0,xmm2 ; xmm0=data2H - psubd xmm5,xmm4 ; xmm5=data5L - psubd xmm6,xmm2 ; xmm6=data5H - - movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] - - paddd xmm3,xmm7 - paddd xmm0,xmm7 - psrad xmm3,DESCALE_P1 - psrad xmm0,DESCALE_P1 - paddd xmm5,xmm7 - paddd xmm6,xmm7 - psrad xmm5,DESCALE_P1 - psrad xmm6,DESCALE_P1 - - packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) - packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) - - movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L - movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H - movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L - movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H - - movdqa xmm0,xmm1 - movdqa xmm6,xmm4 - paddd xmm1,xmm2 ; xmm1=data3L - paddd xmm4,xmm7 ; xmm4=data3H - psubd xmm0,xmm2 ; xmm0=data4L - psubd xmm6,xmm7 ; xmm6=data4H - - movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] - - paddd xmm1,xmm2 - paddd xmm4,xmm2 - psrad xmm1,DESCALE_P1 - psrad xmm4,DESCALE_P1 - paddd xmm0,xmm2 - paddd xmm6,xmm2 - psrad xmm0,DESCALE_P1 - psrad xmm6,DESCALE_P1 - - packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) - packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) - movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) - - movdqa xmm4,xmm3 ; transpose coefficients(phase 1) - punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) - punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) - movdqa xmm6,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) - punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) - - movdqa xmm1,xmm7 ; transpose coefficients(phase 2) - punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) - punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) - movdqa xmm5,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) - punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 
17 27 37) - - movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) - movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) - - movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) - movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) - - movdqa xmm2,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) - punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) - punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) - - movdqa xmm3,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) - punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) - movdqa xmm4,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) - punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) - - movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) - movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 - movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 - - movdqa xmm3,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) - punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) - movdqa xmm4,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) - punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 - movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + 
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm4, xmm3 ; xmm3=in6=z3 + punpckhwd xmm5, xmm3 + movdqa xmm1, xmm4 + movdqa xmm3, xmm5 + pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L + pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H + + movdqa xmm6, xmm0 + paddw xmm0, xmm2 ; xmm0=in0+in4 + psubw xmm6, xmm2 ; xmm6=in0-in4 + + pxor xmm7, xmm7 + pxor xmm2, xmm2 + punpcklwd xmm7, xmm0 ; xmm7=tmp0L + punpckhwd xmm2, xmm0 ; xmm2=tmp0H + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0, xmm7 + paddd xmm7, xmm4 ; xmm7=tmp10L + psubd xmm0, xmm4 ; xmm0=tmp13L + movdqa xmm4, xmm2 + paddd xmm2, xmm5 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm7, xmm7 + punpcklwd xmm5, xmm6 ; xmm5=tmp1L + punpckhwd xmm7, xmm6 ; xmm7=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm2, xmm1 ; xmm2=tmp12L + movdqa xmm0, xmm7 + paddd xmm7, xmm3 ; xmm7=tmp11H + psubd xmm0, xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD 
[wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5, xmm6 + movdqa xmm7, xmm4 + paddw xmm5, xmm3 ; xmm5=z3 + paddw xmm7, xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2, xmm5 + movdqa xmm0, xmm5 + punpcklwd xmm2, xmm7 + punpckhwd xmm0, xmm7 + movdqa xmm5, xmm2 + movdqa xmm7, xmm0 + pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L + pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H + pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 
+= z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2, xmm3 + movdqa xmm0, xmm3 + punpcklwd xmm2, xmm4 + punpckhwd xmm0, xmm4 + movdqa xmm3, xmm2 + movdqa xmm4, xmm0 + pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L + pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H + pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L + pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3, xmm5 ; xmm3=tmp3L + paddd xmm4, xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2, xmm1 + movdqa xmm0, xmm1 + punpcklwd xmm2, xmm6 + punpckhwd xmm0, xmm6 + movdqa xmm1, xmm2 + movdqa xmm6, xmm0 + pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L + pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H + pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L + pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm2, xmm5 ; xmm2=tmp1L + paddd xmm0, xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2, xmm5 + movdqa xmm0, xmm7 + paddd xmm5, xmm3 ; xmm5=data0L + paddd xmm7, xmm4 ; xmm7=data0H + psubd xmm2, xmm3 ; xmm2=data7L + psubd xmm0, xmm4 ; xmm0=data7H + + movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] + + paddd xmm5, xmm3 + paddd xmm7, xmm3 + psrad xmm5, DESCALE_P1 + psrad xmm7, DESCALE_P1 + paddd xmm2, xmm3 + paddd xmm0, xmm3 + psrad xmm2, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7, xmm4 + movdqa xmm0, xmm3 + paddd xmm4, xmm1 ; 
xmm4=data1L + paddd xmm3, xmm6 ; xmm3=data1H + psubd xmm7, xmm1 ; xmm7=data6L + psubd xmm0, xmm6 ; xmm0=data6H + + movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] + + paddd xmm4, xmm1 + paddd xmm3, xmm1 + psrad xmm4, DESCALE_P1 + psrad xmm3, DESCALE_P1 + paddd xmm7, xmm1 + paddd xmm0, xmm1 + psrad xmm7, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5, xmm3 + movdqa xmm6, xmm0 + paddd xmm3, xmm4 ; xmm3=data2L + paddd xmm0, xmm2 ; xmm0=data2H + psubd xmm5, xmm4 ; xmm5=data5L + psubd xmm6, xmm2 ; xmm6=data5H + + movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] + + paddd xmm3, xmm7 + paddd xmm0, xmm7 + psrad xmm3, DESCALE_P1 + psrad xmm0, DESCALE_P1 + paddd xmm5, xmm7 + paddd xmm6, xmm7 + psrad xmm5, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0, xmm1 + 
movdqa xmm6, xmm4 + paddd xmm1, xmm2 ; xmm1=data3L + paddd xmm4, xmm7 ; xmm4=data3H + psubd xmm0, xmm2 ; xmm0=data4L + psubd xmm6, xmm7 ; xmm6=data4H + + movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] + + paddd xmm1, xmm2 + paddd xmm4, xmm2 + psrad xmm1, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm0, xmm2 + paddd xmm6, xmm2 + psrad xmm0, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4, xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm0 ; 
xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 .column_end: - ; -- Prefetch the next coefficient block - - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov rax, [original_rbp] - mov rdi, r12 ; (JSAMPROW *) - mov eax, r13d - - ; -- Even part - - ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movdqa xmm6,xmm1 ; xmm1=in2=z2 - movdqa xmm5,xmm1 - punpcklwd xmm6,xmm2 ; xmm2=in6=z3 - punpckhwd xmm5,xmm2 - movdqa xmm1,xmm6 - movdqa xmm2,xmm5 - pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L - pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H - pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L - pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H - - movdqa xmm3,xmm7 - paddw xmm7,xmm0 ; xmm7=in0+in4 - psubw xmm3,xmm0 ; xmm3=in0-in4 - - pxor xmm4,xmm4 - pxor xmm0,xmm0 - punpcklwd xmm4,xmm7 ; xmm4=tmp0L - punpckhwd xmm0,xmm7 ; xmm0=tmp0H - psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS - psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS - - movdqa xmm7,xmm4 - paddd xmm4,xmm6 ; xmm4=tmp10L - psubd xmm7,xmm6 ; xmm7=tmp13L - movdqa xmm6,xmm0 - paddd xmm0,xmm5 ; xmm0=tmp10H - psubd xmm6,xmm5 ; xmm6=tmp13H - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H - movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L - movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H - - pxor xmm5,xmm5 - pxor xmm4,xmm4 - punpcklwd xmm5,xmm3 ; xmm5=tmp1L - punpckhwd xmm4,xmm3 ; xmm4=tmp1H - psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS - psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS - - movdqa xmm0,xmm5 - paddd xmm5,xmm1 ; xmm5=tmp11L - psubd xmm0,xmm1 ; xmm0=tmp12L - movdqa xmm7,xmm4 - paddd xmm4,xmm2 ; xmm4=tmp11H - psubd xmm7,xmm2 ; xmm7=tmp12H - - movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L - movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H - movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L - 
movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H - - ; -- Odd part - - movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 - movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 - movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 - movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 - - movdqa xmm5,xmm6 - movdqa xmm4,xmm3 - paddw xmm5,xmm1 ; xmm5=z3 - paddw xmm4,xmm2 ; xmm4=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm0,xmm5 - movdqa xmm7,xmm5 - punpcklwd xmm0,xmm4 - punpckhwd xmm7,xmm4 - movdqa xmm5,xmm0 - movdqa xmm4,xmm7 - pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L - pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H - pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L - pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H - - movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L - movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movdqa xmm0,xmm1 - movdqa xmm7,xmm1 - punpcklwd xmm0,xmm3 - punpckhwd xmm7,xmm3 - movdqa xmm1,xmm0 - movdqa xmm3,xmm7 - pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L - pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H - pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L - pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H - - paddd 
xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L - paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H - paddd xmm1,xmm5 ; xmm1=tmp3L - paddd xmm3,xmm4 ; xmm3=tmp3H - - movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L - movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H - - movdqa xmm0,xmm2 - movdqa xmm7,xmm2 - punpcklwd xmm0,xmm6 - punpckhwd xmm7,xmm6 - movdqa xmm2,xmm0 - movdqa xmm6,xmm7 - pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L - pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H - pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L - pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H - - paddd xmm0,xmm5 ; xmm0=tmp1L - paddd xmm7,xmm4 ; xmm7=tmp1H - paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L - paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H - - movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L - movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H - - ; -- Final output stage - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L - movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H - - movdqa xmm0,xmm5 - movdqa xmm7,xmm4 - paddd xmm5,xmm1 ; xmm5=data0L - paddd xmm4,xmm3 ; xmm4=data0H - psubd xmm0,xmm1 ; xmm0=data7L - psubd xmm7,xmm3 ; xmm7=data7H - - movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] - - paddd xmm5,xmm1 - paddd xmm4,xmm1 - psrad xmm5,DESCALE_P2 - psrad xmm4,DESCALE_P2 - paddd xmm0,xmm1 - paddd xmm7,xmm1 - psrad xmm0,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) - packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) - - movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L - movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H - - movdqa xmm4,xmm3 - movdqa xmm7,xmm1 - paddd xmm3,xmm2 ; xmm3=data1L - paddd xmm1,xmm6 ; xmm1=data1H - psubd xmm4,xmm2 ; xmm4=data6L - psubd xmm7,xmm6 ; xmm7=data6H - - movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] - - paddd xmm3,xmm2 - paddd xmm1,xmm2 - psrad xmm3,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm4,xmm2 - paddd xmm7,xmm2 - psrad xmm4,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) - 
packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) - - packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L - movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H - movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L - movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm4,xmm6 - movdqa xmm0,xmm2 - paddd xmm6,xmm1 ; xmm6=data2L - paddd xmm2,xmm7 ; xmm2=data2H - psubd xmm4,xmm1 ; xmm4=data5L - psubd xmm0,xmm7 ; xmm0=data5H - - movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] - - paddd xmm6,xmm5 - paddd xmm2,xmm5 - psrad xmm6,DESCALE_P2 - psrad xmm2,DESCALE_P2 - paddd xmm4,xmm5 - paddd xmm0,xmm5 - psrad xmm4,DESCALE_P2 - psrad xmm0,DESCALE_P2 - - packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) - packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) - - movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L - movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H - movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L - movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H - - movdqa xmm2,xmm3 - movdqa xmm0,xmm1 - paddd xmm3,xmm7 ; xmm3=data3L - paddd xmm1,xmm5 ; xmm1=data3H - psubd xmm2,xmm7 ; xmm2=data4L - psubd xmm0,xmm5 ; xmm0=data4H - - movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] - - paddd xmm3,xmm7 - paddd xmm1,xmm7 - psrad xmm3,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm2,xmm7 - paddd xmm0,xmm7 - psrad xmm2,DESCALE_P2 - psrad xmm0,DESCALE_P2 - - movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] - - packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) - packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - movdqa xmm1, XMMWORD [wk(1)] ; 
xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) - packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) - - paddb xmm7,xmm5 - paddb xmm1,xmm5 - paddb xmm6,xmm5 - paddb xmm3,xmm5 - - movdqa xmm0,xmm7 ; transpose coefficients(phase 1) - punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) - punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) - punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) - - movdqa xmm4,xmm7 ; transpose coefficients(phase 2) - punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) - movdqa xmm5,xmm2 ; transpose coefficients(phase 2) - punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - movdqa xmm3,xmm4 ; transpose coefficients(phase 3) - punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) - punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) - - pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) - pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) - - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD 
[rdx+rax*SIZEOF_JSAMPLE], xmm7 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 - mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 - - mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 - mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 - - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm6, xmm2 ; xmm2=in6=z3 + punpckhwd xmm5, xmm2 + movdqa xmm1, xmm6 + movdqa xmm2, xmm5 + pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L + pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H + + movdqa xmm3, xmm7 + paddw xmm7, xmm0 ; xmm7=in0+in4 + psubw xmm3, xmm0 ; xmm3=in0-in4 + + pxor xmm4, xmm4 + pxor xmm0, xmm0 + punpcklwd xmm4, xmm7 ; xmm4=tmp0L + punpckhwd xmm0, xmm7 ; xmm0=tmp0H + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7, xmm4 + paddd xmm4, xmm6 ; xmm4=tmp10L + psubd xmm7, xmm6 ; xmm7=tmp13L + movdqa xmm6, xmm0 + paddd xmm0, xmm5 ; xmm0=tmp10H + psubd xmm6, xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm4, xmm4 + punpcklwd xmm5, xmm3 ; xmm5=tmp1L + punpckhwd xmm4, xmm3 ; xmm4=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm0, xmm1 ; xmm0=tmp12L + movdqa xmm7, xmm4 + paddd xmm4, xmm2 ; xmm4=tmp11H + psubd xmm7, xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa 
XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5, xmm6 + movdqa xmm4, xmm3 + paddw xmm5, xmm1 ; xmm5=z3 + paddw xmm4, xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0, xmm5 + movdqa xmm7, xmm5 + punpcklwd xmm0, xmm4 + punpckhwd xmm7, xmm4 + movdqa xmm5, xmm0 + movdqa xmm4, xmm7 + pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L + pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H + pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0, xmm1 + movdqa xmm7, xmm1 + punpcklwd xmm0, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm1, xmm0 + movdqa xmm3, xmm7 + pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L + pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H + pmaddwd xmm1, [rel PW_MF089_F060] ; 
xmm1=tmp3L + pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1, xmm5 ; xmm1=tmp3L + paddd xmm3, xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm0, xmm6 + punpckhwd xmm7, xmm6 + movdqa xmm2, xmm0 + movdqa xmm6, xmm7 + pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L + pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H + pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L + pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm0, xmm5 ; xmm0=tmp1L + paddd xmm7, xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0, xmm5 + movdqa xmm7, xmm4 + paddd xmm5, xmm1 ; xmm5=data0L + paddd xmm4, xmm3 ; xmm4=data0H + psubd xmm0, xmm1 ; xmm0=data7L + psubd xmm7, xmm3 ; xmm7=data7H + + movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] + + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrad xmm5, DESCALE_P2 + psrad xmm4, DESCALE_P2 + paddd xmm0, xmm1 + paddd xmm7, xmm1 + psrad xmm0, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4, xmm3 + movdqa xmm7, xmm1 + paddd xmm3, xmm2 ; xmm3=data1L + paddd xmm1, xmm6 ; xmm1=data1H + psubd xmm4, xmm2 ; xmm4=data6L + psubd xmm7, xmm6 ; xmm7=data6H + + movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] + + paddd xmm3, xmm2 + paddd xmm1, xmm2 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm4, xmm2 + paddd xmm7, xmm2 
+ psrad xmm4, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4, xmm6 + movdqa xmm0, xmm2 + paddd xmm6, xmm1 ; xmm6=data2L + paddd xmm2, xmm7 ; xmm2=data2H + psubd xmm4, xmm1 ; xmm4=data5L + psubd xmm0, xmm7 ; xmm0=data5H + + movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] + + paddd xmm6, xmm5 + paddd xmm2, xmm5 + psrad xmm6, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm4, xmm5 + paddd xmm0, xmm5 + psrad xmm4, DESCALE_P2 + psrad xmm0, DESCALE_P2 + + packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2, xmm3 + movdqa xmm0, xmm1 + paddd xmm3, xmm7 ; xmm3=data3L + paddd xmm1, xmm5 ; xmm1=data3H + psubd xmm2, xmm7 ; xmm2=data4L + psubd xmm0, xmm5 ; xmm0=data4H + + movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] + + paddd xmm3, xmm7 + paddd xmm1, xmm7 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm2, xmm7 + paddd xmm0, xmm7 + psrad xmm2, DESCALE_P2 + psrad xmm0, DESCALE_P2 + + movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] + + packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2, xmm0 ; xmm2=data4=(04 
14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7, xmm5 + paddb xmm1, xmm5 + paddb xmm6, xmm5 + paddb xmm3, xmm5 + + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5, xmm3, 
0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 + mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 + + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jidctint-sse2.asm b/simd/jidctint-sse2.asm index 6c7e7d9..03ef3d9 100644 --- a/simd/jidctint-sse2.asm +++ b/simd/jidctint-sse2.asm @@ -25,67 +25,67 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) %if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +F_0_298 equ DESCALE( 320652955, 30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276, 30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887, 30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813, 30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267, 30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350, 30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673, 30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341, 30-CONST_BITS) ; FIX(3.072711026) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_islow_sse2) + alignz 16 + global EXTN(jconst_idct_islow_sse2) EXTN(jconst_idct_islow_sse2): -PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 4 dw F_0_541, 
(F_0_541-F_1_847) -PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -94,765 +94,765 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; jpeg_component_info *compptr -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; jpeg_component_info *compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 12 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 12 - align 16 - global EXTN(jsimd_idct_islow_sse2) + align 16 + global EXTN(jsimd_idct_islow_sse2) EXTN(jsimd_idct_islow_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm1,xmm0 - packsswb xmm1,xmm1 - packsswb xmm1,xmm1 - movd eax,xmm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw xmm5,PASS1_BITS - - movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) - punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) - - pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) - pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) - pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) - pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) - pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) - pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) - pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) - pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) - - movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 - movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 - movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 - movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 - jmp near .column_end - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD 
[XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1, xmm0 + packsswb xmm1, xmm1 + packsswb xmm1, xmm1 + movd eax, xmm1 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5, PASS1_BITS + + movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end + alignx 16, 7 %endif .columnDCT: - ; -- Even part - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD 
[XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movdqa xmm4,xmm1 ; xmm1=in2=z2 - movdqa xmm5,xmm1 - punpcklwd xmm4,xmm3 ; xmm3=in6=z3 - punpckhwd xmm5,xmm3 - movdqa xmm1,xmm4 - movdqa xmm3,xmm5 - pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L - pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H - pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L - pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H - - movdqa xmm6,xmm0 - paddw xmm0,xmm2 ; xmm0=in0+in4 - psubw xmm6,xmm2 ; xmm6=in0-in4 - - pxor xmm7,xmm7 - pxor xmm2,xmm2 - punpcklwd xmm7,xmm0 ; xmm7=tmp0L - punpckhwd xmm2,xmm0 ; xmm2=tmp0H - psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS - psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS - - movdqa xmm0,xmm7 - paddd xmm7,xmm4 ; xmm7=tmp10L - psubd xmm0,xmm4 ; xmm0=tmp13L - movdqa xmm4,xmm2 - paddd xmm2,xmm5 ; xmm2=tmp10H - psubd xmm4,xmm5 ; xmm4=tmp13H - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L - movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H - movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L - movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H - - pxor xmm5,xmm5 - pxor xmm7,xmm7 - punpcklwd xmm5,xmm6 ; xmm5=tmp1L - punpckhwd xmm7,xmm6 ; xmm7=tmp1H - psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS - psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS - - movdqa xmm2,xmm5 - paddd xmm5,xmm1 ; xmm5=tmp11L - psubd xmm2,xmm1 ; xmm2=tmp12L - movdqa xmm0,xmm7 - paddd xmm7,xmm3 ; xmm7=tmp11H - psubd xmm0,xmm3 ; xmm0=tmp12H - - movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L - movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H - movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L - 
movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H - - ; -- Odd part - - movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm5,xmm6 - movdqa xmm7,xmm4 - paddw xmm5,xmm3 ; xmm5=z3 - paddw xmm7,xmm1 ; xmm7=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm2,xmm5 - movdqa xmm0,xmm5 - punpcklwd xmm2,xmm7 - punpckhwd xmm0,xmm7 - movdqa xmm5,xmm2 - movdqa xmm7,xmm0 - pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L - pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H - pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L - pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H - - movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L - movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; 
tmp2 += z3; tmp3 += z4; - - movdqa xmm2,xmm3 - movdqa xmm0,xmm3 - punpcklwd xmm2,xmm4 - punpckhwd xmm0,xmm4 - movdqa xmm3,xmm2 - movdqa xmm4,xmm0 - pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L - pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H - pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L - pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H - - paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L - paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H - paddd xmm3,xmm5 ; xmm3=tmp3L - paddd xmm4,xmm7 ; xmm4=tmp3H - - movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L - movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H - - movdqa xmm2,xmm1 - movdqa xmm0,xmm1 - punpcklwd xmm2,xmm6 - punpckhwd xmm0,xmm6 - movdqa xmm1,xmm2 - movdqa xmm6,xmm0 - pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L - pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H - pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L - pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H - - paddd xmm2,xmm5 ; xmm2=tmp1L - paddd xmm0,xmm7 ; xmm0=tmp1H - paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L - paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H - - movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L - movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H - - ; -- Final output stage - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L - movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H - - movdqa xmm2,xmm5 - movdqa xmm0,xmm7 - paddd xmm5,xmm3 ; xmm5=data0L - paddd xmm7,xmm4 ; xmm7=data0H - psubd xmm2,xmm3 ; xmm2=data7L - psubd xmm0,xmm4 ; xmm0=data7H - - movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] - - paddd xmm5,xmm3 - paddd xmm7,xmm3 - psrad xmm5,DESCALE_P1 - psrad xmm7,DESCALE_P1 - paddd xmm2,xmm3 - paddd xmm0,xmm3 - psrad xmm2,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) - packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) - - movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L - movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H - - movdqa xmm7,xmm4 - movdqa xmm0,xmm3 - paddd 
xmm4,xmm1 ; xmm4=data1L - paddd xmm3,xmm6 ; xmm3=data1H - psubd xmm7,xmm1 ; xmm7=data6L - psubd xmm0,xmm6 ; xmm0=data6H - - movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] - - paddd xmm4,xmm1 - paddd xmm3,xmm1 - psrad xmm4,DESCALE_P1 - psrad xmm3,DESCALE_P1 - paddd xmm7,xmm1 - paddd xmm0,xmm1 - psrad xmm7,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) - packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) - - movdqa xmm6,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) - punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) - movdqa xmm1,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) - punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) - - movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L - movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H - movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L - movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) - movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) - movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) - - movdqa xmm5,xmm3 - movdqa xmm6,xmm0 - paddd xmm3,xmm4 ; xmm3=data2L - paddd xmm0,xmm2 ; xmm0=data2H - psubd xmm5,xmm4 ; xmm5=data5L - psubd xmm6,xmm2 ; xmm6=data5H - - movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] - - paddd xmm3,xmm7 - paddd xmm0,xmm7 - psrad xmm3,DESCALE_P1 - psrad xmm0,DESCALE_P1 - paddd xmm5,xmm7 - paddd xmm6,xmm7 - psrad xmm5,DESCALE_P1 - psrad xmm6,DESCALE_P1 - - packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) - packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) - - movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L - movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H - movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L - movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H - - movdqa xmm0,xmm1 - movdqa xmm6,xmm4 - 
paddd xmm1,xmm2 ; xmm1=data3L - paddd xmm4,xmm7 ; xmm4=data3H - psubd xmm0,xmm2 ; xmm0=data4L - psubd xmm6,xmm7 ; xmm6=data4H - - movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] - - paddd xmm1,xmm2 - paddd xmm4,xmm2 - psrad xmm1,DESCALE_P1 - psrad xmm4,DESCALE_P1 - paddd xmm0,xmm2 - paddd xmm6,xmm2 - psrad xmm0,DESCALE_P1 - psrad xmm6,DESCALE_P1 - - packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) - packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) - movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) - - movdqa xmm4,xmm3 ; transpose coefficients(phase 1) - punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) - punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) - movdqa xmm6,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) - punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) - - movdqa xmm1,xmm7 ; transpose coefficients(phase 2) - punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) - punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) - movdqa xmm5,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) - punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) - - movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) - movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) - - movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) - movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) - - movdqa xmm2,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) - punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) - punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) - - movdqa xmm3,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) - punpckhqdq xmm3,xmm0 
; xmm3=col1=(01 11 21 31 41 51 61 71) - movdqa xmm4,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) - punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) - - movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) - movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 - movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 - - movdqa xmm3,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) - punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) - movdqa xmm4,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) - punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 - movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm4, xmm3 ; xmm3=in6=z3 + punpckhwd xmm5, xmm3 + movdqa xmm1, xmm4 + movdqa xmm3, xmm5 + pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L + pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd 
xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H + + movdqa xmm6, xmm0 + paddw xmm0, xmm2 ; xmm0=in0+in4 + psubw xmm6, xmm2 ; xmm6=in0-in4 + + pxor xmm7, xmm7 + pxor xmm2, xmm2 + punpcklwd xmm7, xmm0 ; xmm7=tmp0L + punpckhwd xmm2, xmm0 ; xmm2=tmp0H + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0, xmm7 + paddd xmm7, xmm4 ; xmm7=tmp10L + psubd xmm0, xmm4 ; xmm0=tmp13L + movdqa xmm4, xmm2 + paddd xmm2, xmm5 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm7, xmm7 + punpcklwd xmm5, xmm6 ; xmm5=tmp1L + punpckhwd xmm7, xmm6 ; xmm7=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm2, xmm1 ; xmm2=tmp12L + movdqa xmm0, xmm7 + paddd xmm7, xmm3 ; xmm7=tmp11H + psubd xmm0, xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5, xmm6 + movdqa xmm7, xmm4 + paddw xmm5, xmm3 ; xmm5=z3 + paddw xmm7, xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + 
z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2, xmm5 + movdqa xmm0, xmm5 + punpcklwd xmm2, xmm7 + punpckhwd xmm0, xmm7 + movdqa xmm5, xmm2 + movdqa xmm7, xmm0 + pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L + pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H + pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2, xmm3 + movdqa xmm0, xmm3 + punpcklwd xmm2, xmm4 + punpckhwd xmm0, xmm4 + movdqa xmm3, xmm2 + movdqa xmm4, xmm0 + pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L + pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H + pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L + pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3, xmm5 ; xmm3=tmp3L + paddd xmm4, xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2, xmm1 + movdqa xmm0, xmm1 + 
punpcklwd xmm2, xmm6 + punpckhwd xmm0, xmm6 + movdqa xmm1, xmm2 + movdqa xmm6, xmm0 + pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L + pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H + pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L + pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm2, xmm5 ; xmm2=tmp1L + paddd xmm0, xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2, xmm5 + movdqa xmm0, xmm7 + paddd xmm5, xmm3 ; xmm5=data0L + paddd xmm7, xmm4 ; xmm7=data0H + psubd xmm2, xmm3 ; xmm2=data7L + psubd xmm0, xmm4 ; xmm0=data7H + + movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] + + paddd xmm5, xmm3 + paddd xmm7, xmm3 + psrad xmm5, DESCALE_P1 + psrad xmm7, DESCALE_P1 + paddd xmm2, xmm3 + paddd xmm0, xmm3 + psrad xmm2, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7, xmm4 + movdqa xmm0, xmm3 + paddd xmm4, xmm1 ; xmm4=data1L + paddd xmm3, xmm6 ; xmm3=data1H + psubd xmm7, xmm1 ; xmm7=data6L + psubd xmm0, xmm6 ; xmm0=data6H + + movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] + + paddd xmm4, xmm1 + paddd xmm3, xmm1 + psrad xmm4, DESCALE_P1 + psrad xmm3, DESCALE_P1 + paddd xmm7, xmm1 + paddd xmm0, xmm1 + psrad xmm7, DESCALE_P1 + psrad xmm0, DESCALE_P1 + + packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6, xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 
; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1, xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5, xmm3 + movdqa xmm6, xmm0 + paddd xmm3, xmm4 ; xmm3=data2L + paddd xmm0, xmm2 ; xmm0=data2H + psubd xmm5, xmm4 ; xmm5=data5L + psubd xmm6, xmm2 ; xmm6=data5H + + movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] + + paddd xmm3, xmm7 + paddd xmm0, xmm7 + psrad xmm3, DESCALE_P1 + psrad xmm0, DESCALE_P1 + paddd xmm5, xmm7 + paddd xmm6, xmm7 + psrad xmm5, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0, xmm1 + movdqa xmm6, xmm4 + paddd xmm1, xmm2 ; xmm1=data3L + paddd xmm4, xmm7 ; xmm4=data3H + psubd xmm0, xmm2 ; xmm0=data4L + psubd xmm6, xmm7 ; xmm6=data4H + + movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] + + paddd xmm1, xmm2 + paddd xmm4, xmm2 + psrad xmm1, DESCALE_P1 + psrad xmm4, DESCALE_P1 + paddd xmm0, xmm2 + paddd xmm6, xmm2 + psrad xmm0, DESCALE_P1 + psrad xmm6, DESCALE_P1 + + packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD 
[wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4, xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6, xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2, xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3, xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4, xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3, xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 
74) + punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4, xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 .column_end: - ; -- Prefetch the next coefficient block - - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - - ; -- Even part - - ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movdqa xmm6,xmm1 ; xmm1=in2=z2 - movdqa xmm5,xmm1 - punpcklwd xmm6,xmm2 ; xmm2=in6=z3 - punpckhwd xmm5,xmm2 - movdqa xmm1,xmm6 - movdqa xmm2,xmm5 - pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L - pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H - pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L - pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H - - movdqa xmm3,xmm7 - paddw xmm7,xmm0 ; xmm7=in0+in4 - psubw xmm3,xmm0 ; xmm3=in0-in4 - - pxor xmm4,xmm4 - pxor xmm0,xmm0 - punpcklwd xmm4,xmm7 ; xmm4=tmp0L - punpckhwd xmm0,xmm7 ; xmm0=tmp0H - psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS - psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS - - movdqa xmm7,xmm4 - paddd xmm4,xmm6 ; xmm4=tmp10L - psubd xmm7,xmm6 ; xmm7=tmp13L - movdqa xmm6,xmm0 - paddd xmm0,xmm5 ; xmm0=tmp10H - psubd xmm6,xmm5 
; xmm6=tmp13H - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H - movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L - movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H - - pxor xmm5,xmm5 - pxor xmm4,xmm4 - punpcklwd xmm5,xmm3 ; xmm5=tmp1L - punpckhwd xmm4,xmm3 ; xmm4=tmp1H - psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS - psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS - - movdqa xmm0,xmm5 - paddd xmm5,xmm1 ; xmm5=tmp11L - psubd xmm0,xmm1 ; xmm0=tmp12L - movdqa xmm7,xmm4 - paddd xmm4,xmm2 ; xmm4=tmp11H - psubd xmm7,xmm2 ; xmm7=tmp12H - - movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L - movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H - movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L - movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H - - ; -- Odd part - - movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 - movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 - movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 - movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 - - movdqa xmm5,xmm6 - movdqa xmm4,xmm3 - paddw xmm5,xmm1 ; xmm5=z3 - paddw xmm4,xmm2 ; xmm4=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm0,xmm5 - movdqa xmm7,xmm5 - punpcklwd xmm0,xmm4 - punpckhwd xmm7,xmm4 - movdqa xmm5,xmm0 - movdqa xmm4,xmm7 - pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H - pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L - pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H - - movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L - movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; 
z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movdqa xmm0,xmm1 - movdqa xmm7,xmm1 - punpcklwd xmm0,xmm3 - punpckhwd xmm7,xmm3 - movdqa xmm1,xmm0 - movdqa xmm3,xmm7 - pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H - pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L - pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H - - paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L - paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H - paddd xmm1,xmm5 ; xmm1=tmp3L - paddd xmm3,xmm4 ; xmm3=tmp3H - - movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L - movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H - - movdqa xmm0,xmm2 - movdqa xmm7,xmm2 - punpcklwd xmm0,xmm6 - punpckhwd xmm7,xmm6 - movdqa xmm2,xmm0 - movdqa xmm6,xmm7 - pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H - pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L - pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H - - paddd xmm0,xmm5 ; xmm0=tmp1L - paddd xmm7,xmm4 ; xmm7=tmp1H - paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L - paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H - - movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L - movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H - - ; -- Final output stage - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L - movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H - - movdqa xmm0,xmm5 - movdqa xmm7,xmm4 - paddd xmm5,xmm1 ; xmm5=data0L - paddd xmm4,xmm3 ; xmm4=data0H - psubd xmm0,xmm1 ; xmm0=data7L - psubd xmm7,xmm3 ; xmm7=data7H - - movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; 
xmm1=[PD_DESCALE_P2] - - paddd xmm5,xmm1 - paddd xmm4,xmm1 - psrad xmm5,DESCALE_P2 - psrad xmm4,DESCALE_P2 - paddd xmm0,xmm1 - paddd xmm7,xmm1 - psrad xmm0,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) - packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) - - movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L - movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H - - movdqa xmm4,xmm3 - movdqa xmm7,xmm1 - paddd xmm3,xmm2 ; xmm3=data1L - paddd xmm1,xmm6 ; xmm1=data1H - psubd xmm4,xmm2 ; xmm4=data6L - psubd xmm7,xmm6 ; xmm7=data6H - - movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] - - paddd xmm3,xmm2 - paddd xmm1,xmm2 - psrad xmm3,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm4,xmm2 - paddd xmm7,xmm2 - psrad xmm4,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) - packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) - - packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L - movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H - movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L - movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm4,xmm6 - movdqa xmm0,xmm2 - paddd xmm6,xmm1 ; xmm6=data2L - paddd xmm2,xmm7 ; xmm2=data2H - psubd xmm4,xmm1 ; xmm4=data5L - psubd xmm0,xmm7 ; xmm0=data5H - - movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] - - paddd xmm6,xmm5 - paddd xmm2,xmm5 - psrad xmm6,DESCALE_P2 - psrad xmm2,DESCALE_P2 - paddd xmm4,xmm5 - paddd xmm0,xmm5 - psrad xmm4,DESCALE_P2 - psrad xmm0,DESCALE_P2 - - packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) - packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) - - movdqa xmm3, 
XMMWORD [wk(2)] ; xmm3=tmp13L - movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H - movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L - movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H - - movdqa xmm2,xmm3 - movdqa xmm0,xmm1 - paddd xmm3,xmm7 ; xmm3=data3L - paddd xmm1,xmm5 ; xmm1=data3H - psubd xmm2,xmm7 ; xmm2=data4L - psubd xmm0,xmm5 ; xmm0=data4H - - movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] - - paddd xmm3,xmm7 - paddd xmm1,xmm7 - psrad xmm3,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm2,xmm7 - paddd xmm0,xmm7 - psrad xmm2,DESCALE_P2 - psrad xmm0,DESCALE_P2 - - movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] - - packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) - packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) - packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) - - paddb xmm7,xmm5 - paddb xmm1,xmm5 - paddb xmm6,xmm5 - paddb xmm3,xmm5 - - movdqa xmm0,xmm7 ; transpose coefficients(phase 1) - punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) - punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) - punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) - - movdqa xmm4,xmm7 ; transpose coefficients(phase 2) - punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) - movdqa xmm5,xmm2 ; transpose coefficients(phase 2) - punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 
66 67 74 75 76 77) - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - movdqa xmm3,xmm4 ; transpose coefficients(phase 3) - punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) - punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) - - pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) - pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 - mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 - - mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 - mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into 
output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6, xmm1 ; xmm1=in2=z2 + movdqa xmm5, xmm1 + punpcklwd xmm6, xmm2 ; xmm2=in6=z3 + punpckhwd xmm5, xmm2 + movdqa xmm1, xmm6 + movdqa xmm2, xmm5 + pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L + pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H + + movdqa xmm3, xmm7 + paddw xmm7, xmm0 ; xmm7=in0+in4 + psubw xmm3, xmm0 ; xmm3=in0-in4 + + pxor xmm4, xmm4 + pxor xmm0, xmm0 + punpcklwd xmm4, xmm7 ; xmm4=tmp0L + punpckhwd xmm0, xmm7 ; xmm0=tmp0H + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7, xmm4 + paddd xmm4, xmm6 ; xmm4=tmp10L + psubd xmm7, xmm6 ; xmm7=tmp13L + movdqa xmm6, xmm0 + paddd xmm0, xmm5 ; xmm0=tmp10H + psubd xmm6, xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5, xmm5 + pxor xmm4, xmm4 + punpcklwd xmm5, xmm3 ; xmm5=tmp1L + punpckhwd xmm4, xmm3 ; xmm4=tmp1H + psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0, xmm5 + paddd xmm5, xmm1 ; xmm5=tmp11L + psubd xmm0, xmm1 ; xmm0=tmp12L + movdqa xmm7, xmm4 + paddd xmm4, xmm2 ; xmm4=tmp11H + psubd xmm7, xmm2 ; xmm7=tmp12H + + movdqa 
XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5, xmm6 + movdqa xmm4, xmm3 + paddw xmm5, xmm1 ; xmm5=z3 + paddw xmm4, xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0, xmm5 + movdqa xmm7, xmm5 + punpcklwd xmm0, xmm4 + punpckhwd xmm7, xmm4 + movdqa xmm5, xmm0 + movdqa xmm4, xmm7 + pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H + pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0, xmm1 + movdqa xmm7, xmm1 + punpcklwd xmm0, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm1, xmm0 + movdqa xmm3, xmm7 + pmaddwd xmm0, 
[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H + pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L + pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1, xmm5 ; xmm1=tmp3L + paddd xmm3, xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm0, xmm6 + punpckhwd xmm7, xmm6 + movdqa xmm2, xmm0 + movdqa xmm6, xmm7 + pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L + pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H + pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L + pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm0, xmm5 ; xmm0=tmp1L + paddd xmm7, xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0, xmm5 + movdqa xmm7, xmm4 + paddd xmm5, xmm1 ; xmm5=data0L + paddd xmm4, xmm3 ; xmm4=data0H + psubd xmm0, xmm1 ; xmm0=data7L + psubd xmm7, xmm3 ; xmm7=data7H + + movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] + + paddd xmm5, xmm1 + paddd xmm4, xmm1 + psrad xmm5, DESCALE_P2 + psrad xmm4, DESCALE_P2 + paddd xmm0, xmm1 + paddd xmm7, xmm1 + psrad xmm0, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4, xmm3 + movdqa xmm7, xmm1 + paddd xmm3, xmm2 ; xmm3=data1L + paddd xmm1, xmm6 ; xmm1=data1H + psubd xmm4, xmm2 ; xmm4=data6L + psubd xmm7, xmm6 ; xmm7=data6H + 
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] + + paddd xmm3, xmm2 + paddd xmm1, xmm2 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm4, xmm2 + paddd xmm7, xmm2 + psrad xmm4, DESCALE_P2 + psrad xmm7, DESCALE_P2 + + packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4, xmm6 + movdqa xmm0, xmm2 + paddd xmm6, xmm1 ; xmm6=data2L + paddd xmm2, xmm7 ; xmm2=data2H + psubd xmm4, xmm1 ; xmm4=data5L + psubd xmm0, xmm7 ; xmm0=data5H + + movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] + + paddd xmm6, xmm5 + paddd xmm2, xmm5 + psrad xmm6, DESCALE_P2 + psrad xmm2, DESCALE_P2 + paddd xmm4, xmm5 + paddd xmm0, xmm5 + psrad xmm4, DESCALE_P2 + psrad xmm0, DESCALE_P2 + + packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2, xmm3 + movdqa xmm0, xmm1 + paddd xmm3, xmm7 ; xmm3=data3L + paddd xmm1, xmm5 ; xmm1=data3H + psubd xmm2, xmm7 ; xmm2=data4L + psubd xmm0, xmm5 ; xmm0=data4H + + movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] + + paddd xmm3, xmm7 + paddd xmm1, xmm7 + psrad xmm3, DESCALE_P2 + psrad xmm1, DESCALE_P2 + paddd xmm2, xmm7 + paddd xmm0, xmm7 + psrad xmm2, 
DESCALE_P2 + psrad xmm0, DESCALE_P2 + + movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] + + packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7, xmm5 + paddb xmm1, xmm5 + paddb xmm6, xmm5 + paddb xmm3, xmm5 + + movdqa xmm0, xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2, xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4, xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5, xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1, xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3, xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 
01 02 03 04 05 06 07) + pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jidctred-sse2-64.asm b/simd/jidctred-sse2-64.asm index a54bbe2..31a3f36 100644 --- a/simd/jidctred-sse2-64.asm +++ b/simd/jidctred-sse2-64.asm @@ -26,74 +26,74 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) -%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) -%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) -%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) +%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) +%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) +%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) +%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) %if CONST_BITS == 13 -F_0_211 equ 1730 ; FIX(0.211164243) -F_0_509 equ 4176 ; FIX(0.509795579) -F_0_601 equ 4926 ; FIX(0.601344887) -F_0_720 equ 5906 ; FIX(0.720959822) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_850 equ 6967 ; FIX(0.850430095) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_061 equ 8697 ; FIX(1.061594337) -F_1_272 equ 10426 ; FIX(1.272758580) -F_1_451 equ 11893 ; FIX(1.451774981) -F_1_847 equ 15137 ; FIX(1.847759065) -F_2_172 equ 17799 ; FIX(2.172734803) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_624 equ 29692 ; FIX(3.624509785) +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) -F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) -F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) -F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) -F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) -F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) +F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785) %endif ; -------------------------------------------------------------------------- - 
SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_red_sse2) + alignz 16 + global EXTN(jconst_idct_red_sse2) EXTN(jconst_idct_red_sse2): -PW_F184_MF076 times 4 dw F_1_847,-F_0_765 -PW_F256_F089 times 4 dw F_2_562, F_0_899 -PW_F106_MF217 times 4 dw F_1_061,-F_2_172 -PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 -PW_F145_MF021 times 4 dw F_1_451,-F_0_211 -PW_F362_MF127 times 4 dw F_3_624,-F_1_272 -PW_F085_MF072 times 4 dw F_0_850,-F_0_720 -PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) -PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) -PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) -PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE - - alignz 16 +PW_F184_MF076 times 4 dw F_1_847,-F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061,-F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 +PW_F145_MF021 times 4 dw F_1_451,-F_0_211 +PW_F362_MF127 times 4 dw F_3_624,-F_1_272 +PW_F085_MF072 times 4 dw F_0_850,-F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform dequantization and inverse DCT on one block of coefficients, ; producing a reduced-size 4x4 output block. 
@@ -108,292 +108,292 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13 = JDIMENSION output_col -%define original_rbp rbp+0 -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_idct_4x4_sse2) + align 16 + global EXTN(jsimd_idct_4x4_sse2) EXTN(jsimd_idct_4x4_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args + push rbp + mov rax, rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp], rax + mov rbp, rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args - ; ---- Pass 1: process columns from input. + ; ---- Pass 1: process columns from input. - mov rdx, r10 ; quantptr - mov rsi, r11 ; inptr + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - por xmm0,xmm1 - packsswb xmm0,xmm0 - packsswb xmm0,xmm0 - movd eax,xmm0 - test rax,rax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw xmm0,PASS1_BITS - - movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 
07 07) - - pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) - pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) - pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) - pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) - - jmp near .column_end + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm0, xmm1 + packsswb xmm0, xmm0 + packsswb xmm0, xmm0 + movd eax, xmm0 + test rax, rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0, PASS1_BITS + + movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end %endif .columnDCT: - ; -- Odd part - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - 
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm4,xmm0 - movdqa xmm5,xmm0 - punpcklwd xmm4,xmm1 - punpckhwd xmm5,xmm1 - movdqa xmm0,xmm4 - movdqa xmm1,xmm5 - pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L) - pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H) - pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L) - pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H) - - movdqa xmm6,xmm2 - movdqa xmm7,xmm2 - punpcklwd xmm6,xmm3 - punpckhwd xmm7,xmm3 - movdqa xmm2,xmm6 - movdqa xmm3,xmm7 - pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L) - pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H) - pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L) - pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H) - - paddd xmm6,xmm4 ; xmm6=tmp2L - paddd xmm7,xmm5 ; xmm7=tmp2H - paddd xmm2,xmm0 ; xmm2=tmp0L - paddd xmm3,xmm1 ; xmm3=tmp0H - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H - - ; -- Even part - - movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - pxor xmm1,xmm1 - pxor xmm2,xmm2 - punpcklwd xmm1,xmm4 ; xmm1=tmp0L - punpckhwd xmm2,xmm4 ; xmm2=tmp0H - psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 - psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 - - movdqa xmm3,xmm5 ; xmm5=in2=z2 - punpcklwd xmm5,xmm0 ; xmm0=in6=z3 - punpckhwd xmm3,xmm0 - pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L - pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H - - movdqa xmm4,xmm1 - movdqa xmm0,xmm2 - paddd xmm1,xmm5 ; xmm1=tmp10L - paddd xmm2,xmm3 ; xmm2=tmp10H - psubd xmm4,xmm5 ; xmm4=tmp12L - psubd xmm0,xmm3 ; xmm0=tmp12H - - ; -- Final output stage - - movdqa xmm5,xmm1 - movdqa xmm3,xmm2 - paddd 
xmm1,xmm6 ; xmm1=data0L - paddd xmm2,xmm7 ; xmm2=data0H - psubd xmm5,xmm6 ; xmm5=data3L - psubd xmm3,xmm7 ; xmm3=data3H - - movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4] - - paddd xmm1,xmm6 - paddd xmm2,xmm6 - psrad xmm1,DESCALE_P1_4 - psrad xmm2,DESCALE_P1_4 - paddd xmm5,xmm6 - paddd xmm3,xmm6 - psrad xmm5,DESCALE_P1_4 - psrad xmm3,DESCALE_P1_4 - - packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) - packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H - - movdqa xmm2,xmm4 - movdqa xmm3,xmm0 - paddd xmm4,xmm7 ; xmm4=data1L - paddd xmm0,xmm6 ; xmm0=data1H - psubd xmm2,xmm7 ; xmm2=data2L - psubd xmm3,xmm6 ; xmm3=data2H - - movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4] - - paddd xmm4,xmm7 - paddd xmm0,xmm7 - psrad xmm4,DESCALE_P1_4 - psrad xmm0,DESCALE_P1_4 - paddd xmm2,xmm7 - paddd xmm3,xmm7 - psrad xmm2,DESCALE_P1_4 - psrad xmm3,DESCALE_P1_4 - - packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) - packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) - - movdqa xmm6,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) - punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) - movdqa xmm7,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) - - movdqa xmm0,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) - punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) - movdqa xmm3,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) - punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD 
[XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L) + pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H) + pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L) + pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H) + + movdqa xmm6, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm6, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L) + pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H) + pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L) + pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H) + + paddd xmm6, xmm4 ; xmm6=tmp2L + paddd xmm7, xmm5 ; xmm7=tmp2H + paddd xmm2, xmm0 ; xmm2=tmp0L + paddd xmm3, xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1, xmm1 + pxor xmm2, xmm2 + punpcklwd xmm1, xmm4 ; xmm1=tmp0L + punpckhwd xmm2, xmm4 ; xmm2=tmp0H + psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 + + movdqa xmm3, xmm5 ; xmm5=in2=z2 + punpcklwd xmm5, xmm0 ; xmm0=in6=z3 + punpckhwd xmm3, xmm0 + pmaddwd xmm5, 
[rel PW_F184_MF076] ; xmm5=tmp2L + pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H + + movdqa xmm4, xmm1 + movdqa xmm0, xmm2 + paddd xmm1, xmm5 ; xmm1=tmp10L + paddd xmm2, xmm3 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp12L + psubd xmm0, xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5, xmm1 + movdqa xmm3, xmm2 + paddd xmm1, xmm6 ; xmm1=data0L + paddd xmm2, xmm7 ; xmm2=data0H + psubd xmm5, xmm6 ; xmm5=data3L + psubd xmm3, xmm7 ; xmm3=data3H + + movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4] + + paddd xmm1, xmm6 + paddd xmm2, xmm6 + psrad xmm1, DESCALE_P1_4 + psrad xmm2, DESCALE_P1_4 + paddd xmm5, xmm6 + paddd xmm3, xmm6 + psrad xmm5, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2, xmm4 + movdqa xmm3, xmm0 + paddd xmm4, xmm7 ; xmm4=data1L + paddd xmm0, xmm6 ; xmm0=data1H + psubd xmm2, xmm7 ; xmm2=data2L + psubd xmm3, xmm6 ; xmm3=data2H + + movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4] + + paddd xmm4, xmm7 + paddd xmm0, xmm7 + psrad xmm4, DESCALE_P1_4 + psrad xmm0, DESCALE_P1_4 + paddd xmm2, xmm7 + paddd xmm3, xmm7 + psrad xmm2, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 
33) + movdqa xmm3, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) .column_end: - ; -- Prefetch the next coefficient block + ; -- Prefetch the next coefficient block - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - ; ---- Pass 2: process rows, store into output array. + ; ---- Pass 2: process rows, store into output array. - mov rax, [original_rbp] - mov rdi, r12 ; (JSAMPROW *) - mov eax, r13d + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d - ; -- Even part + ; -- Even part - pxor xmm4,xmm4 - punpcklwd xmm4,xmm1 ; xmm4=tmp0 - psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + pxor xmm4, xmm4 + punpcklwd xmm4, xmm1 ; xmm4=tmp0 + psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 - ; -- Odd part + ; -- Odd part - punpckhwd xmm1,xmm0 - punpckhwd xmm6,xmm3 - movdqa xmm5,xmm1 - movdqa xmm2,xmm6 - pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2) - pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2) - pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0) - pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0) + punpckhwd xmm1, xmm0 + punpckhwd xmm6, xmm3 + movdqa xmm5, xmm1 + movdqa xmm2, xmm6 + pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2) + pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2) + pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0) + pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0) - paddd xmm6,xmm1 ; xmm6=tmp2 - paddd xmm2,xmm5 ; xmm2=tmp0 + paddd xmm6, xmm1 ; xmm6=tmp2 + paddd xmm2, xmm5 ; xmm2=tmp0 - ; -- Even part + ; -- Even part - 
punpcklwd xmm0,xmm3 - pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2 + punpcklwd xmm0, xmm3 + pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2 - movdqa xmm7,xmm4 - paddd xmm4,xmm0 ; xmm4=tmp10 - psubd xmm7,xmm0 ; xmm7=tmp12 + movdqa xmm7, xmm4 + paddd xmm4, xmm0 ; xmm4=tmp10 + psubd xmm7, xmm0 ; xmm7=tmp12 - ; -- Final output stage + ; -- Final output stage - movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4] + movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4] - movdqa xmm5,xmm4 - movdqa xmm3,xmm7 - paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) - paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) - psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) - psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + movdqa xmm5, xmm4 + movdqa xmm3, xmm7 + paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32) - paddd xmm4,xmm1 - paddd xmm7,xmm1 - psrad xmm4,DESCALE_P2_4 - psrad xmm7,DESCALE_P2_4 - paddd xmm5,xmm1 - paddd xmm3,xmm1 - psrad xmm5,DESCALE_P2_4 - psrad xmm3,DESCALE_P2_4 + paddd xmm4, xmm1 + paddd xmm7, xmm1 + psrad xmm4, DESCALE_P2_4 + psrad xmm7, DESCALE_P2_4 + paddd xmm5, xmm1 + paddd xmm3, xmm1 + psrad xmm5, DESCALE_P2_4 + psrad xmm3, DESCALE_P2_4 - packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) - packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33) - movdqa xmm0,xmm4 ; transpose coefficients(phase 1) - punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) - punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + movdqa xmm0, xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33) - movdqa xmm6,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) - punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + 
movdqa xmm6, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33) - packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) - paddb xmm4,[rel PB_CENTERJSAMP] + packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4, [rel PB_CENTERJSAMP] - pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) - pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) - pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) + pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 - movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 - mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] - movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 - movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + uncollect_args + mov rsp, rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; -------------------------------------------------------------------------- @@ -411,165 +411,165 @@ EXTN(jsimd_idct_4x4_sse2): ; r12 = JSAMPARRAY output_buf ; r13 = JDIMENSION output_col - align 16 - global EXTN(jsimd_idct_2x2_sse2) + align 16 + global EXTN(jsimd_idct_2x2_sse2) EXTN(jsimd_idct_2x2_sse2): - push rbp - mov 
rax,rsp - mov rbp,rsp - collect_args - push rbx - - ; ---- Pass 1: process columns from input. - - mov rdx, r10 ; quantptr - mov rsi, r11 ; inptr - - ; | input: | result: | - ; | 00 01 ** 03 ** 05 ** 07 | | - ; | 10 11 ** 13 ** 15 ** 17 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | - ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | - ; | 50 51 ** 53 ** 55 ** 57 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 70 71 ** 73 ** 75 ** 77 | | - - ; -- Odd part + push rbp + mov rax, rsp + mov rbp, rsp + collect_args + push rbx + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - ; xmm0=(10 11 ** 
13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) - ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) - pcmpeqd xmm7,xmm7 - pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + pcmpeqd xmm7, xmm7 + pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} - movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) - movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) - punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) - punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) - pmaddwd xmm4,[rel PW_F362_MF127] - pmaddwd xmm5,[rel PW_F085_MF072] + movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4, [rel PW_F362_MF127] + pmaddwd xmm5, [rel PW_F085_MF072] - psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) - pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) - psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) - pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) - por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) - por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) - pmaddwd xmm0,[rel PW_F362_MF127] - pmaddwd xmm2,[rel PW_F085_MF072] + psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0, [rel PW_F362_MF127] + pmaddwd xmm2, [rel PW_F085_MF072] - paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] - paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7] - ; -- Even part + ; -- Even part - 
movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - ; xmm6=(00 01 ** 03 ** 05 ** 07) + ; xmm6=(00 01 ** 03 ** 05 ** 07) - movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) - pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) - pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) - psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] - psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] - ; -- Final output stage + ; -- Final output stage - movdqa xmm3,xmm6 - movdqa xmm5,xmm1 - paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) - paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) - psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) - psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + movdqa xmm3, xmm6 + movdqa xmm5, xmm1 + paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) - movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2] + movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2] - punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) + punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **) - movdqa xmm7,xmm1 - punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) - punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) + movdqa xmm7, xmm1 + punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7, xmm5 ; 
xmm7=(A5 A7 B5 B7) - paddd xmm6,xmm2 - psrad xmm6,DESCALE_P1_2 + paddd xmm6, xmm2 + psrad xmm6, DESCALE_P1_2 - paddd xmm1,xmm2 - paddd xmm7,xmm2 - psrad xmm1,DESCALE_P1_2 - psrad xmm7,DESCALE_P1_2 + paddd xmm1, xmm2 + paddd xmm7, xmm2 + psrad xmm1, DESCALE_P1_2 + psrad xmm7, DESCALE_P1_2 - ; -- Prefetch the next coefficient block + ; -- Prefetch the next coefficient block - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - ; ---- Pass 2: process rows, store into output array. + ; ---- Pass 2: process rows, store into output array. - mov rdi, r12 ; (JSAMPROW *) - mov eax, r13d + mov rdi, r12 ; (JSAMPROW *) + mov eax, r13d - ; | input:| result:| - ; | A0 B0 | | - ; | A1 B1 | C0 C1 | - ; | A3 B3 | D0 D1 | - ; | A5 B5 | | - ; | A7 B7 | | + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | - ; -- Odd part + ; -- Odd part - packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) - packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) - pmaddwd xmm1,[rel PW_F362_MF127] - pmaddwd xmm7,[rel PW_F085_MF072] + packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1, [rel PW_F362_MF127] + pmaddwd xmm7, [rel PW_F085_MF072] - paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1] - ; -- Even part + ; -- Even part - pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] - ; -- Final output stage + ; -- Final output stage - movdqa xmm4,xmm6 - paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 
C1 ** **) - psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + movdqa xmm4, xmm6 + paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) - punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) + punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1) - paddd xmm6,[rel PD_DESCALE_P2_2] - psrad xmm6,DESCALE_P2_2 + paddd xmm6, [rel PD_DESCALE_P2_2] + psrad xmm6, DESCALE_P2_2 - packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) - packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) - paddb xmm6,[rel PB_CENTERJSAMP] + packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) + paddb xmm6, [rel PB_CENTERJSAMP] - pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) - pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) + pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --) + pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --) - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx - mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx + mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx - pop rbx - uncollect_args - pop rbp - ret + pop rbx + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jidctred-sse2.asm b/simd/jidctred-sse2.asm index 232d983..761fba8 100644 --- a/simd/jidctred-sse2.asm +++ b/simd/jidctred-sse2.asm @@ -25,74 +25,74 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) -%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) -%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) -%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) +%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) +%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) +%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) +%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) %if CONST_BITS == 13 -F_0_211 equ 1730 ; FIX(0.211164243) -F_0_509 equ 4176 ; FIX(0.509795579) -F_0_601 equ 4926 ; FIX(0.601344887) -F_0_720 equ 5906 ; FIX(0.720959822) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_850 equ 6967 ; FIX(0.850430095) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_061 equ 8697 ; FIX(1.061594337) -F_1_272 equ 10426 ; FIX(1.272758580) -F_1_451 equ 11893 ; FIX(1.451774981) -F_1_847 equ 15137 ; FIX(1.847759065) -F_2_172 equ 17799 ; FIX(2.172734803) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_624 equ 29692 ; FIX(3.624509785) +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) -F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) -F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) -F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) -F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) -F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) +F_0_211 equ DESCALE( 226735879, 30-CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834, 30-CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155, 30-CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714, 30-CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413, 30-CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361, 30-CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111, 30-CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239, 30-CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119, 30-CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516, 30-CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188, 30-CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230, 30-CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506, 30-CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747, 30-CONST_BITS) ; FIX(3.624509785) %endif ; -------------------------------------------------------------------------- - 
SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_red_sse2) + alignz 16 + global EXTN(jconst_idct_red_sse2) EXTN(jconst_idct_red_sse2): -PW_F184_MF076 times 4 dw F_1_847,-F_0_765 -PW_F256_F089 times 4 dw F_2_562, F_0_899 -PW_F106_MF217 times 4 dw F_1_061,-F_2_172 -PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 -PW_F145_MF021 times 4 dw F_1_451,-F_0_211 -PW_F362_MF127 times 4 dw F_3_624,-F_1_272 -PW_F085_MF072 times 4 dw F_0_850,-F_0_720 -PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) -PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) -PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) -PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE - - alignz 16 +PW_F184_MF076 times 4 dw F_1_847,-F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061,-F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 +PW_F145_MF021 times 4 dw F_1_451,-F_0_211 +PW_F362_MF127 times 4 dw F_3_624,-F_1_272 +PW_F085_MF072 times 4 dw F_0_850,-F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients, ; producing a reduced-size 4x4 output block. 
@@ -102,309 +102,309 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; void *dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_idct_4x4_sse2) + align 16 + global EXTN(jsimd_idct_4x4_sse2) EXTN(jsimd_idct_4x4_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr + push ebp + mov eax, esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp], eax + mov ebp, esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm0,xmm1 - packsswb xmm0,xmm0 - packsswb xmm0,xmm0 - movd eax,xmm0 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw xmm0,PASS1_BITS - - movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) - - pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) - pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) - pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) - pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) - - jmp near .column_end - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm0, xmm1 + packsswb xmm0, xmm0 + packsswb xmm0, xmm0 + movd eax, xmm0 + test eax, eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD 
[XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0, PASS1_BITS + + movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end + alignx 16, 7 %endif .columnDCT: - ; -- Odd part - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm4,xmm0 - movdqa xmm5,xmm0 - punpcklwd xmm4,xmm1 - punpckhwd xmm5,xmm1 - movdqa xmm0,xmm4 - movdqa xmm1,xmm5 - pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) - pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) - pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) - pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) - - movdqa xmm6,xmm2 - movdqa xmm7,xmm2 - punpcklwd xmm6,xmm3 - punpckhwd xmm7,xmm3 - movdqa xmm2,xmm6 - movdqa xmm3,xmm7 - pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L) - pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) - pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) - pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) - - paddd xmm6,xmm4 ; xmm6=tmp2L - paddd xmm7,xmm5 ; xmm7=tmp2H - paddd xmm2,xmm0 ; xmm2=tmp0L - paddd xmm3,xmm1 ; xmm3=tmp0H - - movdqa 
XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H - - ; -- Even part - - movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - pxor xmm1,xmm1 - pxor xmm2,xmm2 - punpcklwd xmm1,xmm4 ; xmm1=tmp0L - punpckhwd xmm2,xmm4 ; xmm2=tmp0H - psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 - psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 - - movdqa xmm3,xmm5 ; xmm5=in2=z2 - punpcklwd xmm5,xmm0 ; xmm0=in6=z3 - punpckhwd xmm3,xmm0 - pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L - pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H - - movdqa xmm4,xmm1 - movdqa xmm0,xmm2 - paddd xmm1,xmm5 ; xmm1=tmp10L - paddd xmm2,xmm3 ; xmm2=tmp10H - psubd xmm4,xmm5 ; xmm4=tmp12L - psubd xmm0,xmm3 ; xmm0=tmp12H - - ; -- Final output stage - - movdqa xmm5,xmm1 - movdqa xmm3,xmm2 - paddd xmm1,xmm6 ; xmm1=data0L - paddd xmm2,xmm7 ; xmm2=data0H - psubd xmm5,xmm6 ; xmm5=data3L - psubd xmm3,xmm7 ; xmm3=data3H - - movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] - - paddd xmm1,xmm6 - paddd xmm2,xmm6 - psrad xmm1,DESCALE_P1_4 - psrad xmm2,DESCALE_P1_4 - paddd xmm5,xmm6 - paddd xmm3,xmm6 - psrad xmm5,DESCALE_P1_4 - psrad xmm3,DESCALE_P1_4 - - packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) - packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H - - movdqa xmm2,xmm4 - movdqa xmm3,xmm0 - paddd xmm4,xmm7 ; xmm4=data1L - paddd xmm0,xmm6 ; xmm0=data1H - psubd xmm2,xmm7 ; xmm2=data2L - psubd xmm3,xmm6 ; xmm3=data2H - - movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] - - paddd 
xmm4,xmm7 - paddd xmm0,xmm7 - psrad xmm4,DESCALE_P1_4 - psrad xmm0,DESCALE_P1_4 - paddd xmm2,xmm7 - paddd xmm3,xmm7 - psrad xmm2,DESCALE_P1_4 - psrad xmm3,DESCALE_P1_4 - - packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) - packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) - - movdqa xmm6,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) - punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) - movdqa xmm7,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) - - movdqa xmm0,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) - punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) - movdqa xmm3,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) - punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) + pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) + pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) + pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) + + movdqa xmm6, xmm2 + movdqa xmm7, xmm2 + punpcklwd xmm6, xmm3 + punpckhwd xmm7, xmm3 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + pmaddwd 
xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L) + pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) + pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) + pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) + + paddd xmm6, xmm4 ; xmm6=tmp2L + paddd xmm7, xmm5 ; xmm7=tmp2H + paddd xmm2, xmm0 ; xmm2=tmp0L + paddd xmm3, xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1, xmm1 + pxor xmm2, xmm2 + punpcklwd xmm1, xmm4 ; xmm1=tmp0L + punpckhwd xmm2, xmm4 ; xmm2=tmp0H + psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 + + movdqa xmm3, xmm5 ; xmm5=in2=z2 + punpcklwd xmm5, xmm0 ; xmm0=in6=z3 + punpckhwd xmm3, xmm0 + pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L + pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H + + movdqa xmm4, xmm1 + movdqa xmm0, xmm2 + paddd xmm1, xmm5 ; xmm1=tmp10L + paddd xmm2, xmm3 ; xmm2=tmp10H + psubd xmm4, xmm5 ; xmm4=tmp12L + psubd xmm0, xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5, xmm1 + movdqa xmm3, xmm2 + paddd xmm1, xmm6 ; xmm1=data0L + paddd xmm2, xmm7 ; xmm2=data0H + psubd xmm5, xmm6 ; xmm5=data3L + psubd xmm3, xmm7 ; xmm3=data3H + + movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] + + paddd xmm1, xmm6 + paddd xmm2, xmm6 + psrad xmm1, DESCALE_P1_4 + psrad xmm2, DESCALE_P1_4 + paddd xmm5, xmm6 + paddd xmm3, xmm6 + psrad xmm5, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 
07) + packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2, xmm4 + movdqa xmm3, xmm0 + paddd xmm4, xmm7 ; xmm4=data1L + paddd xmm0, xmm6 ; xmm0=data1H + psubd xmm2, xmm7 ; xmm2=data2L + psubd xmm3, xmm6 ; xmm3=data2H + + movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] + + paddd xmm4, xmm7 + paddd xmm0, xmm7 + psrad xmm4, DESCALE_P1_4 + psrad xmm0, DESCALE_P1_4 + paddd xmm2, xmm7 + paddd xmm3, xmm7 + psrad xmm2, DESCALE_P1_4 + psrad xmm3, DESCALE_P1_4 + + packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6, xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7, xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0, xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3, xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) .column_end: - ; -- Prefetch the next coefficient block + ; -- Prefetch the next coefficient block - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - ; ---- Pass 2: process rows, store into output array. 
+ ; ---- Pass 2: process rows, store into output array. - mov eax, [original_ebp] - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] - ; -- Even part + ; -- Even part - pxor xmm4,xmm4 - punpcklwd xmm4,xmm1 ; xmm4=tmp0 - psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + pxor xmm4, xmm4 + punpcklwd xmm4, xmm1 ; xmm4=tmp0 + psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 - ; -- Odd part + ; -- Odd part - punpckhwd xmm1,xmm0 - punpckhwd xmm6,xmm3 - movdqa xmm5,xmm1 - movdqa xmm2,xmm6 - pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) - pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) - pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) - pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) + punpckhwd xmm1, xmm0 + punpckhwd xmm6, xmm3 + movdqa xmm5, xmm1 + movdqa xmm2, xmm6 + pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) + pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) + pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) + pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) - paddd xmm6,xmm1 ; xmm6=tmp2 - paddd xmm2,xmm5 ; xmm2=tmp0 + paddd xmm6, xmm1 ; xmm6=tmp2 + paddd xmm2, xmm5 ; xmm2=tmp0 - ; -- Even part + ; -- Even part - punpcklwd xmm0,xmm3 - pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 + punpcklwd xmm0, xmm3 + pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 - movdqa xmm7,xmm4 - paddd xmm4,xmm0 ; xmm4=tmp10 - psubd xmm7,xmm0 ; xmm7=tmp12 + movdqa xmm7, xmm4 + paddd xmm4, xmm0 ; xmm4=tmp10 + psubd xmm7, xmm0 ; xmm7=tmp12 - ; -- Final output stage + ; -- Final output stage - movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] + movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] - movdqa xmm5,xmm4 - movdqa xmm3,xmm7 - paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) - 
paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) - psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) - psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + movdqa xmm5, xmm4 + movdqa xmm3, xmm7 + paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32) - paddd xmm4,xmm1 - paddd xmm7,xmm1 - psrad xmm4,DESCALE_P2_4 - psrad xmm7,DESCALE_P2_4 - paddd xmm5,xmm1 - paddd xmm3,xmm1 - psrad xmm5,DESCALE_P2_4 - psrad xmm3,DESCALE_P2_4 + paddd xmm4, xmm1 + paddd xmm7, xmm1 + psrad xmm4, DESCALE_P2_4 + psrad xmm7, DESCALE_P2_4 + paddd xmm5, xmm1 + paddd xmm3, xmm1 + psrad xmm5, DESCALE_P2_4 + psrad xmm3, DESCALE_P2_4 - packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) - packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33) - movdqa xmm0,xmm4 ; transpose coefficients(phase 1) - punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) - punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + movdqa xmm0, xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33) - movdqa xmm6,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) - punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + movdqa xmm6, xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33) - packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) - paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)] + packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)] - pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) - pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) - pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) 
+ pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) + pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 - mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 - movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp, ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; -------------------------------------------------------------------------- @@ -417,177 +417,177 @@ EXTN(jsimd_idct_4x4_sse2): ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; void *dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; void *dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col - align 16 - global EXTN(jsimd_idct_2x2_sse2) + align 16 + global EXTN(jsimd_idct_2x2_sse2) EXTN(jsimd_idct_2x2_sse2): - push ebp - mov 
ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - - mov edx, POINTER [dct_table(ebp)] ; quantptr - mov esi, JCOEFPTR [coef_block(ebp)] ; inptr - - ; | input: | result: | - ; | 00 01 ** 03 ** 05 ** 07 | | - ; | 10 11 ** 13 ** 15 ** 17 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | - ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | - ; | 50 51 ** 53 ** 55 ** 57 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 70 71 ** 73 ** 75 ** 77 | | - - ; -- Odd part + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + + mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD 
[XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) - ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) - pcmpeqd xmm7,xmm7 - pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + pcmpeqd xmm7, xmm7 + pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} - movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) - movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) - punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) - punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) - pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)] + movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)] - psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) - pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) - psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) - pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) - por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) - por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) - pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)] + psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2, 
xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)] - paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] - paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7] - ; -- Even part + ; -- Even part - movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - ; xmm6=(00 01 ** 03 ** 05 ** 07) + ; xmm6=(00 01 ** 03 ** 05 ** 07) - movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) - pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) - pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) - psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] - psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] - ; -- Final output stage + ; -- Final output stage - movdqa xmm3,xmm6 - movdqa xmm5,xmm1 - paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) - paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) - psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) - psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + movdqa xmm3, xmm6 + movdqa xmm5, xmm1 + paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) - movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; 
xmm2=[PD_DESCALE_P1_2] + movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2] - punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) + punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **) - movdqa xmm7,xmm1 - punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) - punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) + movdqa xmm7, xmm1 + punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7) - paddd xmm6,xmm2 - psrad xmm6,DESCALE_P1_2 + paddd xmm6, xmm2 + psrad xmm6, DESCALE_P1_2 - paddd xmm1,xmm2 - paddd xmm7,xmm2 - psrad xmm1,DESCALE_P1_2 - psrad xmm7,DESCALE_P1_2 + paddd xmm1, xmm2 + paddd xmm7, xmm2 + psrad xmm1, DESCALE_P1_2 + psrad xmm7, DESCALE_P1_2 - ; -- Prefetch the next coefficient block + ; -- Prefetch the next coefficient block - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - ; ---- Pass 2: process rows, store into output array. + ; ---- Pass 2: process rows, store into output array. 
- mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(ebp)] + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] - ; | input:| result:| - ; | A0 B0 | | - ; | A1 B1 | C0 C1 | - ; | A3 B3 | D0 D1 | - ; | A5 B5 | | - ; | A7 B7 | | + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | - ; -- Odd part + ; -- Odd part - packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) - packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) - pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)] + packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)] - paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1] - ; -- Even part + ; -- Even part - pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] - ; -- Final output stage + ; -- Final output stage - movdqa xmm4,xmm6 - paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) - psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + movdqa xmm4, xmm6 + paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) - punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) + punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1) - paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)] - psrad xmm6,DESCALE_P2_2 + paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)] + psrad xmm6, DESCALE_P2_2 - packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) - packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) - paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)] + packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) 
+ paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)] - pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) - pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) + pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --) + pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov WORD [edx+eax*SIZEOF_JSAMPLE], bx - mov WORD [esi+eax*SIZEOF_JSAMPLE], cx + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov WORD [edx+eax*SIZEOF_JSAMPLE], bx + mov WORD [esi+eax*SIZEOF_JSAMPLE], cx - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jquantf-sse2-64.asm b/simd/jquantf-sse2-64.asm index ef5c1f9..ed69bc2 100644 --- a/simd/jquantf-sse2-64.asm +++ b/simd/jquantf-sse2-64.asm @@ -20,8 +20,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -34,65 +34,65 @@ ; r11 = JDIMENSION start_col ; r12 = FAST_FLOAT *workspace - align 16 - global EXTN(jsimd_convsamp_float_sse2) + align 16 + global EXTN(jsimd_convsamp_float_sse2) EXTN(jsimd_convsamp_float_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - push rbx - - pcmpeqw xmm7,xmm7 - psllw xmm7,7 - packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) - - mov rsi, r10 - mov eax, r11d - mov rdi, r12 - mov rcx, DCTSIZE/2 + push rbp + mov rax, rsp + mov rbp, rsp + collect_args + push rbx + + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 + packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) 
+ + mov rsi, r10 + mov eax, r11d + mov rdi, r12 + mov rcx, DCTSIZE/2 .convloop: - mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] - movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] - psubb xmm0,xmm7 ; xmm0=(01234567) - psubb xmm1,xmm7 ; xmm1=(89ABCDEF) + psubb xmm0, xmm7 ; xmm0=(01234567) + psubb xmm1, xmm7 ; xmm1=(89ABCDEF) - punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) - punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) - punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) - punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) - punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) - punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) + punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F) - psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) - psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) - cvtdq2ps xmm2,xmm2 ; xmm2=(0123) - cvtdq2ps xmm0,xmm0 ; xmm0=(4567) - psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) - psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) - cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) - cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) + psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2, xmm2 ; xmm2=(0123) + cvtdq2ps xmm0, xmm0 ; xmm0=(4567) + psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3, xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF) - movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD 
[XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 - add rsi, byte 2*SIZEOF_JSAMPROW - add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - dec rcx - jnz short .convloop + add rsi, byte 2*SIZEOF_JSAMPROW + add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz short .convloop - pop rbx - uncollect_args - pop rbp - ret + pop rbx + uncollect_args + pop rbp + ret ; -------------------------------------------------------------------------- @@ -108,50 +108,50 @@ EXTN(jsimd_convsamp_float_sse2): ; r11 = FAST_FLOAT *divisors ; r12 = FAST_FLOAT *workspace - align 16 - global EXTN(jsimd_quantize_float_sse2) + align 16 + global EXTN(jsimd_quantize_float_sse2) EXTN(jsimd_quantize_float_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - - mov rsi, r12 - mov rdx, r11 - mov rdi, r10 - mov rax, DCTSIZE2/16 + push rbp + mov rax, rsp + mov rbp, rsp + collect_args + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, DCTSIZE2/16 .quantloop: - movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)] - mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] - mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)] - mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] - mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] - - cvtps2dq xmm0,xmm0 - cvtps2dq xmm1,xmm1 - cvtps2dq xmm2,xmm2 - cvtps2dq xmm3,xmm3 - - packssdw xmm0,xmm1 - packssdw xmm2,xmm3 - - movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0 - movdqa XMMWORD 
[XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2 - - add rsi, byte 16*SIZEOF_FAST_FLOAT - add rdx, byte 16*SIZEOF_FAST_FLOAT - add rdi, byte 16*SIZEOF_JCOEF - dec rax - jnz short .quantloop - - uncollect_args - pop rbp - ret + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2 + + add rsi, byte 16*SIZEOF_FAST_FLOAT + add rdx, byte 16*SIZEOF_FAST_FLOAT + add rdi, byte 16*SIZEOF_JCOEF + dec rax + jnz short .quantloop + + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jquantf-sse2.asm b/simd/jquantf-sse2.asm index 1cbc267..1dca26a 100644 --- a/simd/jquantf-sse2.asm +++ b/simd/jquantf-sse2.asm @@ -19,8 +19,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -29,75 +29,75 @@ ; FAST_FLOAT *workspace); ; -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; FAST_FLOAT *workspace +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; FAST_FLOAT *workspace - align 16 - global EXTN(jsimd_convsamp_float_sse2) + align 16 + global EXTN(jsimd_convsamp_float_sse2) EXTN(jsimd_convsamp_float_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pcmpeqw xmm7,xmm7 - psllw xmm7,7 - packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/2 - alignx 16,7 + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 + packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) 
+ + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16, 7 .convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] - movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] - - psubb xmm0,xmm7 ; xmm0=(01234567) - psubb xmm1,xmm7 ; xmm1=(89ABCDEF) - - punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) - punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) - - punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) - punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) - punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) - punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) - - psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) - psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) - cvtdq2ps xmm2,xmm2 ; xmm2=(0123) - cvtdq2ps xmm0,xmm0 ; xmm0=(4567) - psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) - psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) - cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) - cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 - - add esi, byte 2*SIZEOF_JSAMPROW - add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - dec ecx - jnz short .convloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb xmm0, xmm7 ; xmm0=(01234567) + psubb xmm1, xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2, xmm0 ; 
xmm2=(***0***1***2***3) + punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2, xmm2 ; xmm2=(0123) + cvtdq2ps xmm0, xmm0 ; xmm0=(4567) + psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3, xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; -------------------------------------------------------------------------- @@ -109,62 +109,62 @@ EXTN(jsimd_convsamp_float_sse2): ; FAST_FLOAT *workspace); ; -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; FAST_FLOAT *divisors -%define workspace ebp+16 ; FAST_FLOAT *workspace +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; FAST_FLOAT *divisors +%define workspace ebp+16 ; FAST_FLOAT *workspace - align 16 - global EXTN(jsimd_quantize_float_sse2) + align 16 + global EXTN(jsimd_quantize_float_sse2) EXTN(jsimd_quantize_float_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov eax, DCTSIZE2/16 - alignx 16,7 + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + 
mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16, 7 .quantloop: - movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] - mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - - cvtps2dq xmm0,xmm0 - cvtps2dq xmm1,xmm1 - cvtps2dq xmm2,xmm2 - cvtps2dq xmm3,xmm3 - - packssdw xmm0,xmm1 - packssdw xmm2,xmm3 - - movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 - - add esi, byte 16*SIZEOF_FAST_FLOAT - add edx, byte 16*SIZEOF_FAST_FLOAT - add edi, byte 16*SIZEOF_JCOEF - dec eax - jnz short .quantloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + pop edi + pop esi +; pop 
edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jquanti-sse2-64.asm b/simd/jquanti-sse2-64.asm index 66c4e51..7cb6872 100644 --- a/simd/jquanti-sse2-64.asm +++ b/simd/jquanti-sse2-64.asm @@ -20,8 +20,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -34,60 +34,60 @@ ; r11 = JDIMENSION start_col ; r12 = DCTELEM *workspace - align 16 - global EXTN(jsimd_convsamp_sse2) + align 16 + global EXTN(jsimd_convsamp_sse2) EXTN(jsimd_convsamp_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - push rbx - - pxor xmm6,xmm6 ; xmm6=(all 0's) - pcmpeqw xmm7,xmm7 - psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} - - mov rsi, r10 - mov eax, r11d - mov rdi, r12 - mov rcx, DCTSIZE/4 + push rbp + mov rax, rsp + mov rbp, rsp + collect_args + push rbx + + pxor xmm6, xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov rsi, r10 + mov eax, r11d + mov rdi, r12 + mov rcx, DCTSIZE/4 .convloop: - mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567) - movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) - - mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) - movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) - - punpcklbw xmm0,xmm6 ; xmm0=(01234567) - punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) - paddw xmm0,xmm7 - paddw xmm1,xmm7 - punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) - punpcklbw xmm3,xmm6 ; 
xmm3=(OPQRSTUV) - paddw xmm2,xmm7 - paddw xmm3,xmm7 - - movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 - movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 - - add rsi, byte 4*SIZEOF_JSAMPROW - add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec rcx - jnz short .convloop - - pop rbx - uncollect_args - pop rbp - ret + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0, xmm6 ; xmm0=(01234567) + punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF) + paddw xmm0, xmm7 + paddw xmm1, xmm7 + punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2, xmm7 + paddw xmm3, xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 4*SIZEOF_JSAMPROW + add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec rcx + jnz short .convloop + + pop rbx + uncollect_args + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -102,85 +102,85 @@ EXTN(jsimd_convsamp_sse2): ; DCTELEM *workspace); ; -%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) -%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) -%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) +%define RECIPROCAL(m,n,b) 
XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) +%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) +%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) ; r10 = JCOEFPTR coef_block ; r11 = DCTELEM *divisors ; r12 = DCTELEM *workspace - align 16 - global EXTN(jsimd_quantize_sse2) + align 16 + global EXTN(jsimd_quantize_sse2) EXTN(jsimd_quantize_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - - mov rsi, r12 - mov rdx, r11 - mov rdi, r10 - mov rax, DCTSIZE2/32 + push rbp + mov rax, rsp + mov rbp, rsp + collect_args + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, DCTSIZE2/32 .quantloop: - movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)] - movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)] - movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)] - movdqa xmm0,xmm4 - movdqa xmm1,xmm5 - movdqa xmm2,xmm6 - movdqa xmm3,xmm7 - psraw xmm4,(WORD_BIT-1) - psraw xmm5,(WORD_BIT-1) - psraw xmm6,(WORD_BIT-1) - psraw xmm7,(WORD_BIT-1) - pxor xmm0,xmm4 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; - psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; - psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; - psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; - - paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor - paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)] - paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)] - paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)] - pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal - pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)] - pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)] - pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)] - pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale - pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)] - pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)] - pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)] - - pxor xmm0,xmm4 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - psubw xmm0,xmm4 - psubw 
xmm1,xmm5 - psubw xmm2,xmm6 - psubw xmm3,xmm7 - movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 - movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 - - add rsi, byte 32*SIZEOF_DCTELEM - add rdx, byte 32*SIZEOF_DCTELEM - add rdi, byte 32*SIZEOF_JCOEF - dec rax - jnz near .quantloop - - uncollect_args - pop rbp - ret + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + psraw xmm4, (WORD_BIT-1) + psraw xmm5, (WORD_BIT-1) + psraw xmm6, (WORD_BIT-1) + psraw xmm7, (WORD_BIT-1) + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)] + + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 + psubw xmm1, xmm5 + psubw xmm2, xmm6 + psubw xmm3, xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD 
[XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 32*SIZEOF_DCTELEM + add rdx, byte 32*SIZEOF_DCTELEM + add rdi, byte 32*SIZEOF_JCOEF + dec rax + jnz near .quantloop + + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jquanti-sse2.asm b/simd/jquanti-sse2.asm index aea8604..6fb001f 100644 --- a/simd/jquanti-sse2.asm +++ b/simd/jquanti-sse2.asm @@ -19,8 +19,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -29,70 +29,70 @@ ; DCTELEM *workspace); ; -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; DCTELEM *workspace +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; DCTELEM *workspace - align 16 - global EXTN(jsimd_convsamp_sse2) + align 16 + global EXTN(jsimd_convsamp_sse2) EXTN(jsimd_convsamp_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pxor xmm6,xmm6 ; xmm6=(all 0's) - pcmpeqw xmm7,xmm7 - psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 + push ebp + mov ebp, esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor xmm6, xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7, xmm7 + psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER 
[workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16, 7 .convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) - movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) - - mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) - movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) - - punpcklbw xmm0,xmm6 ; xmm0=(01234567) - punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) - paddw xmm0,xmm7 - paddw xmm1,xmm7 - punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) - punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) - paddw xmm2,xmm7 - paddw xmm3,xmm7 - - movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 - movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 - - add esi, byte 4*SIZEOF_JSAMPROW - add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec ecx - jnz short .convloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0, xmm6 ; xmm0=(01234567) + punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF) + paddw xmm0, xmm7 + paddw xmm1, xmm7 + punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2, 
xmm7 + paddw xmm3, xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -107,93 +107,93 @@ EXTN(jsimd_convsamp_sse2): ; DCTELEM *workspace); ; -%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) -%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) -%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) +%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) +%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) +%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; DCTELEM *divisors -%define workspace ebp+16 ; DCTELEM *workspace +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; DCTELEM *divisors +%define workspace ebp+16 ; DCTELEM *workspace - align 16 - global EXTN(jsimd_quantize_sse2) + align 16 + global EXTN(jsimd_quantize_sse2) EXTN(jsimd_quantize_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov eax, DCTSIZE2/32 - alignx 16,7 + push ebp + mov ebp, esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/32 + alignx 16, 7 
.quantloop: - movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] - movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] - movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] - movdqa xmm0,xmm4 - movdqa xmm1,xmm5 - movdqa xmm2,xmm6 - movdqa xmm3,xmm7 - psraw xmm4,(WORD_BIT-1) - psraw xmm5,(WORD_BIT-1) - psraw xmm6,(WORD_BIT-1) - psraw xmm7,(WORD_BIT-1) - pxor xmm0,xmm4 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; - psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; - psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; - psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; - - paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor - paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] - paddw xmm2, XMMWORD [CORRECTION(2,0,edx)] - paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] - pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal - pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] - pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] - pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] - pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale - pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] - pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] - pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] - - pxor xmm0,xmm4 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - psubw xmm0,xmm4 - psubw xmm1,xmm5 - psubw xmm2,xmm6 - psubw xmm3,xmm7 - movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 - movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 - - add esi, byte 32*SIZEOF_DCTELEM - add edx, byte 32*SIZEOF_DCTELEM - add edi, byte 32*SIZEOF_JCOEF - dec eax - jnz near .quantloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] + 
movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + movdqa xmm2, xmm6 + movdqa xmm3, xmm7 + psraw xmm4, (WORD_BIT-1) + psraw xmm5, (WORD_BIT-1) + psraw xmm6, (WORD_BIT-1) + psraw xmm7, (WORD_BIT-1) + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,edx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] + + pxor xmm0, xmm4 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + psubw xmm0, xmm4 + psubw xmm1, xmm5 + psubw xmm2, xmm6 + psubw xmm3, xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 32*SIZEOF_DCTELEM + add edx, byte 32*SIZEOF_DCTELEM + add edi, byte 32*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jsimdcfg.inc.h b/simd/jsimdcfg.inc.h index d2b499f..81574d5 100644 --- a/simd/jsimdcfg.inc.h +++ b/simd/jsimdcfg.inc.h @@ -19,79 +19,79 @@ ; -- jpeglib.h ; -%define _cpp_protection_DCTSIZE DCTSIZE -%define _cpp_protection_DCTSIZE2 DCTSIZE2 +%define _cpp_protection_DCTSIZE DCTSIZE +%define _cpp_protection_DCTSIZE2 DCTSIZE2 ; ; -- jmorecfg.h ; -%define _cpp_protection_RGB_RED RGB_RED -%define _cpp_protection_RGB_GREEN RGB_GREEN -%define _cpp_protection_RGB_BLUE RGB_BLUE -%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE +%define _cpp_protection_RGB_RED RGB_RED +%define _cpp_protection_RGB_GREEN RGB_GREEN +%define _cpp_protection_RGB_BLUE RGB_BLUE +%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE -%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED -%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN -%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE -%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE +%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED +%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN +%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE +%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE -%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED -%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN -%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE -%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE +%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED +%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN +%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE +%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE -%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED -%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN -%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE -%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE +%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED +%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN +%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE +%define 
_cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE -%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED -%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN -%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE -%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE +%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED +%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN +%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE +%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE -%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED -%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN -%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE -%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE +%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED +%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN +%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE +%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE -%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED -%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN -%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE -%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE +%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED +%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN +%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE +%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE -%define RGBX_FILLER_0XFF 1 +%define RGBX_FILLER_0XFF 1 ; Representation of a single sample (pixel element value). ; On this SIMD implementation, this must be 'unsigned char'. ; -%define JSAMPLE byte ; unsigned char -%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) +%define JSAMPLE byte ; unsigned char +%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) -%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE +%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE ; Representation of a DCT frequency coefficient. ; On this SIMD implementation, this must be 'short'. 
; -%define JCOEF word ; short -%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) +%define JCOEF word ; short +%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) ; Datatype used for image dimensions. ; On this SIMD implementation, this must be 'unsigned int'. ; -%define JDIMENSION dword ; unsigned int -%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) - -%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h) -%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) -%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) -%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h) -%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) -%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) -%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) -%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) +%define JDIMENSION dword ; unsigned int +%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) + +%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h) +%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) +%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) +%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h) +%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) +%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) +%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) +%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) ; ; -- jdct.h @@ -101,30 +101,30 @@ ; the DCT is to be performed in-place in that buffer. ; To maximize parallelism, Type DCTELEM is changed to short (originally, int). ; -%define DCTELEM word ; short -%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) +%define DCTELEM word ; short +%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) -%define FAST_FLOAT FP32 ; float -%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT) +%define FAST_FLOAT FP32 ; float +%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT) ; To maximize parallelism, Type MULTIPLIER is changed to short. 
; -%define ISLOW_MULT_TYPE word ; must be short -%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) +%define ISLOW_MULT_TYPE word ; must be short +%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) -%define IFAST_MULT_TYPE word ; must be short -%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) -%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors +%define IFAST_MULT_TYPE word ; must be short +%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) +%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors -%define FLOAT_MULT_TYPE FP32 ; must be float -%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) +%define FLOAT_MULT_TYPE FP32 ; must be float +%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) ; ; -- jsimd.h ; -%define _cpp_protection_JSIMD_NONE JSIMD_NONE -%define _cpp_protection_JSIMD_MMX JSIMD_MMX -%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW -%define _cpp_protection_JSIMD_SSE JSIMD_SSE -%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2 +%define _cpp_protection_JSIMD_NONE JSIMD_NONE +%define _cpp_protection_JSIMD_MMX JSIMD_MMX +%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW +%define _cpp_protection_JSIMD_SSE JSIMD_SSE +%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2 diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc index f28db60..7c0b809 100644 --- a/simd/jsimdext.inc +++ b/simd/jsimdext.inc @@ -38,11 +38,11 @@ ; -- segment definition -- ; %ifdef __YASM_VER__ -%define SEG_TEXT .text align=16 -%define SEG_CONST .rdata align=16 +%define SEG_TEXT .text align=16 +%define SEG_CONST .rdata align=16 %else -%define SEG_TEXT .text align=16 public use32 class=CODE -%define SEG_CONST .rdata align=16 public use32 class=CONST +%define SEG_TEXT .text align=16 public use32 class=CODE +%define SEG_CONST .rdata align=16 public use32 class=CONST %endif %elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- @@ -57,15 +57,15 @@ %define SEG_TEXT .text align=16 
public use64 class=CODE %define SEG_CONST .rdata align=16 public use64 class=CONST %endif -%define EXTN(name) name ; foo() -> foo +%define EXTN(name) name ; foo() -> foo %elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- ; * Borland C++ (Win32) ; -- segment definition -- ; -%define SEG_TEXT _text align=16 public use32 class=CODE -%define SEG_CONST _data align=16 public use32 class=DATA +%define SEG_TEXT _text align=16 public use32 class=CODE +%define SEG_CONST _data align=16 public use32 class=DATA %elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ ; * Linux @@ -78,17 +78,17 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; -- segment definition -- ; %ifdef __x86_64__ -%define SEG_TEXT .text progbits align=16 -%define SEG_CONST .rodata progbits align=16 +%define SEG_TEXT .text progbits align=16 +%define SEG_CONST .rodata progbits align=16 %else -%define SEG_TEXT .text progbits alloc exec nowrite align=16 -%define SEG_CONST .rodata progbits alloc noexec nowrite align=16 +%define SEG_TEXT .text progbits alloc exec nowrite align=16 +%define SEG_CONST .rodata progbits alloc noexec nowrite align=16 %endif ; To make the code position-independent, append -DPIC to the commandline ; -%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC -%define EXTN(name) name ; foo() -> foo +%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC +%define EXTN(name) name ; foo() -> foo %elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- ; * Older Linux using a.out format (nasm -f aout -DAOUT ...) 
@@ -96,20 +96,20 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; -- segment definition -- ; -%define SEG_TEXT .text -%define SEG_CONST .data +%define SEG_TEXT .text +%define SEG_CONST .data ; To make the code position-independent, append -DPIC to the commandline ; -%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC +%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC %elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- ; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format) ; -- segment definition -- ; -%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why? -%define SEG_CONST .rodata align=16 +%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why? +%define SEG_CONST .rodata align=16 ; The generation of position-independent code (PIC) is the default on Darwin. ; @@ -120,10 +120,10 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; -- segment definition -- ; -%define SEG_TEXT .text -%define SEG_CONST .data +%define SEG_TEXT .text +%define SEG_CONST .data -%endif ; ---------------------------------------------- +%endif ; ---------------------------------------------- ; ========================================================================== @@ -131,54 +131,54 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; Common types ; %ifdef __x86_64__ -%define POINTER qword ; general pointer type -%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER) -%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%define POINTER qword ; general pointer type +%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER) +%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT %else -%define POINTER dword ; general pointer type -%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) -%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT +%define POINTER dword ; general pointer type +%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) +%define POINTER_BIT DWORD_BIT ; 
sizeof(POINTER)*BYTE_BIT %endif -%define INT dword ; signed integer type -%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT) -%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT +%define INT dword ; signed integer type +%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT) +%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT -%define FP32 dword ; IEEE754 single -%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32) -%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT +%define FP32 dword ; IEEE754 single +%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32) +%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT -%define MMWORD qword ; int64 (MMX register) -%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD) -%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT +%define MMWORD qword ; int64 (MMX register) +%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD) +%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT ; NASM is buggy and doesn't properly handle operand sizes for SSE ; instructions, so for now we have to define XMMWORD as blank. 
-%define XMMWORD ; int128 (SSE register) -%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD) -%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT +%define XMMWORD ; int128 (SSE register) +%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD) +%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT ; Similar hacks for when we load a dword or MMWORD into an xmm# register %define XMM_DWORD %define XMM_MMWORD -%define SIZEOF_BYTE 1 ; sizeof(BYTE) -%define SIZEOF_WORD 2 ; sizeof(WORD) -%define SIZEOF_DWORD 4 ; sizeof(DWORD) -%define SIZEOF_QWORD 8 ; sizeof(QWORD) -%define SIZEOF_OWORD 16 ; sizeof(OWORD) +%define SIZEOF_BYTE 1 ; sizeof(BYTE) +%define SIZEOF_WORD 2 ; sizeof(WORD) +%define SIZEOF_DWORD 4 ; sizeof(DWORD) +%define SIZEOF_QWORD 8 ; sizeof(QWORD) +%define SIZEOF_OWORD 16 ; sizeof(OWORD) -%define BYTE_BIT 8 ; CHAR_BIT in C -%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT -%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT -%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT -%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT +%define BYTE_BIT 8 ; CHAR_BIT in C +%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT +%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT +%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT +%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT ; -------------------------------------------------------------------------- ; External Symbol Name ; %ifndef EXTN -%define EXTN(name) _ %+ name ; foo() -> _foo +%define EXTN(name) _ %+ name ; foo() -> _foo %endif ; -------------------------------------------------------------------------- @@ -188,75 +188,76 @@ section .note.GNU-stack noalloc noexec nowrite progbits %undef PIC %endif -%ifdef PIC ; ------------------------------------------- +%ifdef PIC ; ------------------------------------------- -%ifidn GOT_SYMBOL,_MACHO_PIC_ ; -------------------- +%ifidn GOT_SYMBOL, _MACHO_PIC_ ; -------------------- ; At present, nasm doesn't seem to support PIC generation for Mach-O. ; The PIC support code below is a little tricky. 
- SECTION SEG_CONST + SECTION SEG_CONST const_base: -%define GOTOFF(got,sym) (got) + (sym) - const_base +%define GOTOFF(got,sym) (got) + (sym) - const_base %imacro get_GOT 1 - ; NOTE: this macro destroys ecx resister. - call %%geteip - add ecx, byte (%%ref - $) - jmp short %%adjust + ; NOTE: this macro destroys ecx resister. + call %%geteip + add ecx, byte (%%ref - $) + jmp short %%adjust %%geteip: - mov ecx, POINTER [esp] - ret + mov ecx, POINTER [esp] + ret %%adjust: - push ebp - xor ebp,ebp ; ebp = 0 -%ifidni %1,ebx ; (%1 == ebx) - ; db 0x8D,0x9C + jmp near const_base = - ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) - db 0x8D,0x9C ; 8D,9C - jmp near const_base ; E9,(const_base-%%ref) + push ebp + xor ebp, ebp ; ebp = 0 +%ifidni %1, ebx ; (%1 == ebx) + ; db 0x8D,0x9C + jmp near const_base = + ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) + db 0x8D, 0x9C ; 8D,9C + jmp near const_base ; E9,(const_base-%%ref) %%ref: %else ; (%1 != ebx) - ; db 0x8D,0x8C + jmp near const_base = - ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) - db 0x8D,0x8C ; 8D,8C - jmp near const_base ; E9,(const_base-%%ref) -%%ref: mov %1, ecx -%endif ; (%1 == ebx) - pop ebp + ; db 0x8D,0x8C + jmp near const_base = + ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) + db 0x8D, 0x8C ; 8D,8C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: + mov %1, ecx +%endif ; (%1 == ebx) + pop ebp %endmacro -%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- +%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- -%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff +%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff %imacro get_GOT 1 - extern GOT_SYMBOL - call %%geteip - add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc - jmp short %%done + extern GOT_SYMBOL + call %%geteip + add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc + jmp short %%done %%geteip: - mov %1, POINTER [esp] - ret + mov %1, POINTER [esp] + ret %%done: %endmacro -%endif ; GOT_SYMBOL == _MACHO_PIC_ 
---------------- +%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- %imacro pushpic 1.nolist - push %1 + push %1 %endmacro %imacro poppic 1.nolist - pop %1 + pop %1 %endmacro %imacro movpic 2.nolist - mov %1,%2 + mov %1, %2 %endmacro -%else ; !PIC ----------------------------------------- +%else ; !PIC ----------------------------------------- -%define GOTOFF(got,sym) (sym) +%define GOTOFF(got,sym) (sym) %imacro get_GOT 1.nolist %endmacro @@ -267,7 +268,7 @@ const_base: %imacro movpic 2.nolist %endmacro -%endif ; PIC ----------------------------------------- +%endif ; PIC ----------------------------------------- ; -------------------------------------------------------------------------- ; Align the next instruction on {2,4,8,16,..}-byte boundary. @@ -277,28 +278,29 @@ const_base: %define FILLB(b,n) (($$-(b)) & ((n)-1)) %imacro alignx 1-2.nolist 0xFFFF -%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \ - db 0x90 ; nop - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \ - db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \ - db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \ - db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \ - db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \ - db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \ - db 0x8B,0xED ; mov ebp,ebp - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \ - db 0x90 ; nop +%%bs: \ + times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \ + db 0x90 ; nop + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \ + db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \ + db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] + times 
MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \ + db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \ + db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \ + db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \ + db 0x8B,0xED ; mov ebp,ebp + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \ + db 0x90 ; nop %endmacro ; Align the next data on {2,4,8,16,..}-byte boundary. ; %imacro alignz 1.nolist - align %1, db 0 ; filling zeros + align %1, db 0 ; filling zeros %endmacro %ifdef __x86_64__ @@ -306,61 +308,61 @@ const_base: %ifdef WIN64 %imacro collect_args 0 - push r12 - push r13 - push r14 - push r15 - mov r10, rcx - mov r11, rdx - mov r12, r8 - mov r13, r9 - mov r14, [rax+48] - mov r15, [rax+56] - push rsi - push rdi - sub rsp, SIZEOF_XMMWORD - movaps XMMWORD [rsp], xmm6 - sub rsp, SIZEOF_XMMWORD - movaps XMMWORD [rsp], xmm7 + push r12 + push r13 + push r14 + push r15 + mov r10, rcx + mov r11, rdx + mov r12, r8 + mov r13, r9 + mov r14, [rax+48] + mov r15, [rax+56] + push rsi + push rdi + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm6 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm7 %endmacro %imacro uncollect_args 0 - movaps xmm7, XMMWORD [rsp] - add rsp, SIZEOF_XMMWORD - movaps xmm6, XMMWORD [rsp] - add rsp, SIZEOF_XMMWORD - pop rdi - pop rsi - pop r15 - pop r14 - pop r13 - pop r12 + movaps xmm7, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + movaps xmm6, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + pop rdi + pop rsi + pop r15 + pop r14 + pop r13 + pop r12 %endmacro %else %imacro collect_args 0 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - mov r10, rdi - mov r11, rsi - mov r12, rdx - mov r13, rcx - mov r14, r8 - mov r15, r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + mov r10, rdi + mov r11, rsi + mov r12, rdx + mov r13, rcx + mov r14, r8 + mov r15, r9 %endmacro %imacro uncollect_args 0 
- pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 %endmacro %endif