From 69799275be895de4963bb22c975081cb53a147a0 Mon Sep 17 00:00:00 2001 From: DRC Date: Wed, 13 Jun 2012 01:21:29 +0000 Subject: [PATCH] Eliminate the use of the MASKMOVDQU instruction, to speed up decompression performance by 10x on AMD Bobcat embedded processors (and ~5% on AMD desktop processors.) git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.2.x@835 632fc199-4ca6-4c93-a231-07263d6284db --- ChangeLog.txt | 7 +++++++ simd/jdclrss2-64.asm | 49 +++++++++++++++++-------------------------- simd/jdclrss2.asm | 49 +++++++++++++++++-------------------------- simd/jdmrgss2-64.asm | 50 +++++++++++++++++--------------------------- simd/jdmrgss2.asm | 49 +++++++++++++++++-------------------------- 5 files changed, 83 insertions(+), 121 deletions(-) diff --git a/ChangeLog.txt b/ChangeLog.txt index 3775d54..e80ac6c 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -19,6 +19,13 @@ calling conventions. images (specifically, images in which the component count was erroneously set to a large value) would cause libjpeg-turbo to segfault. +[5] Worked around a severe performance issue with "Bobcat" (AMD Embedded APU) +processors. The MASKMOVDQU instruction, which was used by the libjpeg-turbo +SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and +it is painfully slow on Bobcat processors in particular. Eliminating the use +of this instruction improved performance by an order of magnitude on Bobcat +processors and by a small amount (typically 5%) on AMD desktop processors. + 1.2.0 ===== diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm index 696a383..06cb213 100644 --- a/simd/jdclrss2-64.asm +++ b/simd/jdclrss2-64.asm @@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2): movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmH,xmmH ; xmmH=(all 1's) - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF .out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub rcx, byte SIZEOF_XMMWORD jz near .nextrow @@ -275,17 +271,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2): lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE cmp rcx, byte 2*SIZEOF_XMMWORD jb short .column_st16 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmF sub rcx, byte 2*SIZEOF_XMMWORD jmp short .column_st15 .column_st16: cmp rcx, byte SIZEOF_XMMWORD jb short .column_st15 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA add rdi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub rcx, byte SIZEOF_XMMWORD @@ -363,7 +358,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): por xmmA,xmmG por xmmE,xmmC .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA + movdqu XMMWORD [rdi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %else ; RGB_PIXELSIZE == 4 ; ----------- @@ -409,19 +404,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2): movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmE,xmmE ; xmmE=(all 1's) - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH .out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub rcx, byte SIZEOF_XMMWORD jz near .nextrow @@ -434,17 +424,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2): pcmpeqb xmmE,xmmE ; xmmE=(all 1's) cmp rcx, byte SIZEOF_XMMWORD/2 jb short .column_st16 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmC movdqa xmmD,xmmH sub rcx, byte SIZEOF_XMMWORD/2 .column_st16: cmp rcx, byte SIZEOF_XMMWORD/4 jb short .column_st15 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA add rdi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub rcx, byte SIZEOF_XMMWORD/4 @@ -503,7 +492,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): por xmmA,xmmB por xmmE,xmmG .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA + movdqu XMMWORD [rdi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; RGB_PIXELSIZE ; --------------- diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm index 7f519e6..1354c3d 100644 --- a/simd/jdclrss2.asm +++ b/simd/jdclrss2.asm @@ -262,17 +262,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2): movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmH,xmmH ; xmmH=(all 1's) - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF .out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub ecx, byte SIZEOF_XMMWORD jz near .nextrow @@ -287,17 +283,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2): lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE cmp ecx, byte 2*SIZEOF_XMMWORD jb short .column_st16 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmF sub ecx, byte 2*SIZEOF_XMMWORD jmp short .column_st15 .column_st16: cmp ecx, byte SIZEOF_XMMWORD jb short .column_st15 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA add edi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub ecx, byte SIZEOF_XMMWORD @@ -375,7 +370,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): por xmmA,xmmG por xmmE,xmmC .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %else ; RGB_PIXELSIZE == 4 ; ----------- @@ -421,19 +416,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2): movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmE,xmmE ; xmmE=(all 1's) - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH .out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub ecx, byte SIZEOF_XMMWORD jz near .nextrow @@ -447,17 +437,16 @@ EXTN(jsimd_ycc_rgb_convert_sse2): pcmpeqb xmmE,xmmE ; xmmE=(all 1's) cmp ecx, byte SIZEOF_XMMWORD/2 jb short .column_st16 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmC movdqa xmmD,xmmH sub ecx, byte SIZEOF_XMMWORD/2 .column_st16: cmp ecx, byte SIZEOF_XMMWORD/4 jb short .column_st15 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA add edi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub ecx, byte SIZEOF_XMMWORD/4 @@ -516,7 +505,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): por xmmA,xmmB por xmmE,xmmG .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; RGB_PIXELSIZE ; --------------- diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm index a64a6b3..ffe0288 100644 --- a/simd/jdmrgss2-64.asm +++ b/simd/jdmrgss2-64.asm @@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmH,xmmH ; xmmH=(all 1's) - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF .out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub rcx, byte SIZEOF_XMMWORD jz near .endcolumn @@ -275,21 +271,19 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): jmp near .columnloop .column_st32: - pcmpeqb xmmH,xmmH ; xmmH=(all 1's) lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE cmp rcx, byte 2*SIZEOF_XMMWORD jb short .column_st16 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmF sub rcx, byte 2*SIZEOF_XMMWORD jmp short .column_st15 .column_st16: cmp rcx, byte SIZEOF_XMMWORD jb short .column_st15 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA add rdi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub rcx, byte SIZEOF_XMMWORD @@ -367,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): por xmmA,xmmG por xmmE,xmmC .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [rdi],xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %else ; RGB_PIXELSIZE == 4 ; ----------- @@ -413,19 +407,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmE,xmmE ; xmmE=(all 1's) - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH .out0: + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub rcx, byte SIZEOF_XMMWORD jz near .endcolumn @@ -441,17 +430,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): pcmpeqb xmmE,xmmE ; xmmE=(all 1's) cmp rcx, byte SIZEOF_XMMWORD/2 jb short .column_st16 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD - add rdi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmC movdqa xmmD,xmmH sub rcx, byte SIZEOF_XMMWORD/2 .column_st16: cmp rcx, byte SIZEOF_XMMWORD/4 jb short .column_st15 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA add rdi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub rcx, byte SIZEOF_XMMWORD/4 @@ -510,7 +498,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): por xmmA,xmmB por xmmE,xmmG .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [rdi],xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; RGB_PIXELSIZE ; --------------- diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm index 04089aa..556a490 100644 --- a/simd/jdmrgss2.asm +++ b/simd/jdmrgss2.asm @@ -264,17 +264,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmH,xmmH ; xmmH=(all 1's) - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF .out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub ecx, byte SIZEOF_XMMWORD jz near .endcolumn @@ -292,17 +288,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE cmp ecx, byte 2*SIZEOF_XMMWORD jb short .column_st16 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmF sub ecx, byte 2*SIZEOF_XMMWORD jmp short .column_st15 .column_st16: cmp ecx, byte SIZEOF_XMMWORD jb short .column_st15 - maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA add edi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub ecx, byte SIZEOF_XMMWORD @@ -380,7 +375,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): por xmmA,xmmG por xmmE,xmmC .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %else ; RGB_PIXELSIZE == 4 ; ----------- @@ -426,19 +421,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr jmp short .out0 .out1: ; --(unaligned)----------------- - pcmpeqb xmmE,xmmE ; xmmE=(all 1's) - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH .out0: + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr sub ecx, byte SIZEOF_XMMWORD jz near .endcolumn @@ -455,17 +445,16 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): pcmpeqb xmmE,xmmE ; xmmE=(all 1's) cmp ecx, byte SIZEOF_XMMWORD/2 jb short .column_st16 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD - add edi, byte SIZEOF_XMMWORD ; outptr + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmC movdqa xmmD,xmmH sub ecx, byte SIZEOF_XMMWORD/2 .column_st16: cmp ecx, byte SIZEOF_XMMWORD/4 jb short .column_st15 - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA add edi, byte SIZEOF_XMMWORD ; outptr movdqa xmmA,xmmD sub ecx, byte SIZEOF_XMMWORD/4 @@ -524,7 +513,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): por xmmA,xmmB por xmmE,xmmG .adj0: ; ---------------- - maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + movdqu XMMWORD [edi], xmmA %endif ; STRICT_MEMORY_ACCESS ; --------------- %endif ; RGB_PIXELSIZE ; --------------- -- 2.40.0