fail to compile if the Windows system headers were included before jpeglib.h.
This issue was caused by a conflict in the definition of the INT32 type.
+[9] Fixed out-of-bounds read in SSE2 SIMD code that occurred when decompressing
+a JPEG image to a bitmap buffer whose size was not a multiple of 16 bytes.
+This was more of an annoyance than an actual bug, since it did not cause any
+actual run-time problems, but the issue showed up when running libjpeg-turbo in
+valgrind. See http://crbug.com/72399 for more information.
+
1.0.1
=====
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ mov BYTE [rdi], al
+%else
mov rax,rcx
xor rcx, byte 0x0F
shl rcx, 2
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD/8*4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ movd DWORD [rdi], xmmA
+%else
cmp rcx, byte SIZEOF_XMMWORD/16
jb near .nextrow
mov rax,rcx
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ mov BYTE [edi], al
+%else
mov eax,ecx
xor ecx, byte 0x0F
shl ecx, 2
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/8*4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ movd DWORD [edi], xmmA
+%else
cmp ecx, byte SIZEOF_XMMWORD/16
jb short .nextrow
mov eax,ecx
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .endcolumn
+ mov BYTE [rdi], al
+%else
mov rax,rcx
xor rcx, byte 0x0F
shl rcx, 2
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD/8*4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .endcolumn
+ movd DWORD [rdi], xmmA
+%else
cmp rcx, byte SIZEOF_XMMWORD/16
jb near .endcolumn
mov rax,rcx
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov WORD [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ mov BYTE [edi], al
+%else
mov eax,ecx
xor ecx, byte 0x0F
shl ecx, 2
por xmmE,xmmC
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
+%ifdef STRICT_MEMORY_ACCESS
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/2
+ sub ecx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, 64
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ movd DWORD [edi], xmmA
+%else
cmp ecx, byte SIZEOF_XMMWORD/16
jb short .endcolumn
mov eax,ecx
por xmmE,xmmG
.adj0: ; ----------------
maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%endif
+%define STRICT_MEMORY_ACCESS 1
+
; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC