platforms. This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
platforms.
+[7] Fixed a regression caused by 1.2.0[2] whereby, on Linux/x86 platforms,
+decompressing a 4:2:0 or 4:2:2 JPEG image without using fancy upsampling would
+produce several incorrect columns of pixels at the right-hand side of the
+output image if each row in the output image was not evenly divisible by 16
+bytes.
+
1.2.0
=====
;
; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
jmp near .columnloop
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
test rcx, rcx
jz short .nextrow
mov BYTE [rdi], al
-%else
- mov rax,rcx
- xor rcx, byte 0x0F
- shl rcx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add rax,rcx
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,rcx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg ecx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- movdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
jmp near .columnloop
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
test rcx, rcx
jz short .nextrow
movd DWORD [rdi], xmmA
-%else
- cmp rcx, byte SIZEOF_XMMWORD/16
- jb near .nextrow
- mov rax,rcx
- xor rcx, byte 0x03
- inc rcx
- shl rcx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg rcx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- movdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
;
; jdclrss2.asm - colorspace conversion (SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
alignx 16,7
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
test ecx, ecx
jz short .nextrow
mov BYTE [edi], al
-%else
- mov eax,ecx
- xor ecx, byte 0x0F
- shl ecx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add eax,ecx
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg ecx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- movdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
alignx 16,7
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
test ecx, ecx
jz short .nextrow
movd DWORD [edi], xmmA
-%else
- cmp ecx, byte SIZEOF_XMMWORD/16
- jb short .nextrow
- mov eax,ecx
- xor ecx, byte 0x03
- inc ecx
- shl ecx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg ecx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- movdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
;
; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ for
+; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
test rcx, rcx
jz short .endcolumn
mov BYTE [rdi], al
-%else
- mov rax,rcx
- xor rcx, byte 0x0F
- shl rcx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add rax,rcx
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg rcx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- movdqu XMMWORD [rdi],xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
jmp near .columnloop
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
test rcx, rcx
jz short .endcolumn
movd DWORD [rdi], xmmA
-%else
- cmp rcx, byte SIZEOF_XMMWORD/16
- jb near .endcolumn
- mov rax,rcx
- xor rcx, byte 0x03
- inc rcx
- shl rcx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg rcx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- movdqu XMMWORD [rdi],xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
;
; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
alignx 16,7
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
test ecx, ecx
jz short .endcolumn
mov BYTE [edi], al
-%else
- mov eax,ecx
- xor ecx, byte 0x0F
- shl ecx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add eax,ecx
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg ecx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- movdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
alignx 16,7
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq MMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD/2
+ add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, 64
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
movd DWORD [edi], xmmA
-%else
- cmp ecx, byte SIZEOF_XMMWORD/16
- jb short .endcolumn
- mov eax,ecx
- xor ecx, byte 0x03
- inc ecx
- shl ecx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg ecx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- movdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%endif ; RGB_PIXELSIZE ; ---------------
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%endif
-%define STRICT_MEMORY_ACCESS 1
-
; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC