forward DCT were used. This was known to cause 'make test' to fail when the
library was built with '-march=haswell' on x86 systems.
+[3] Fixed an issue whereby libjpeg-turbo would crash when built with a recent
+development version of the Clang/LLVM compiler. This was caused by
+an x86-64 ABI conformance issue in some of libjpeg-turbo's 64-bit SSE2 SIMD
+routines. Those routines were incorrectly using a 64-bit mov instruction to
+transfer a 32-bit JDIMENSION argument, whereas the x86-64 ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined. The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
1.4.1
=====
* images up to 64K*64K due to 16-bit fields in SOF markers. Therefore
* "unsigned int" is sufficient on all machines. However, if you need to
* handle larger images and you don't mind deviating from the spec, you
- * can change this datatype.
+ * can change this datatype. (Note that changing this datatype will
+ * potentially require modifying the SIMD code. The x86-64 SIMD extensions,
+ * in particular, assume a 32-bit JDIMENSION.)
*/
typedef unsigned int JDIMENSION;
collect_args
push rbx
- mov rcx, r10
+ mov ecx, r10d
test rcx,rcx
jz near .return
push rcx
mov rsi, r12
- mov rcx, r13
+ mov ecx, r13d
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
collect_args
push rbx
- mov rcx, r10
+ mov ecx, r10d
test rcx,rcx
jz near .return
push rcx
mov rsi, r12
- mov rcx, r13
+ mov ecx, r13d
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
mov rbp,rsp
collect_args
- mov rcx, r13
+ mov ecx, r13d
shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
jz near .return
- mov rdx, r10
+ mov edx, r10d
; -- expand_right_edge
; -- h2v1_downsample
- mov rax, r12 ; rowctr
+ mov eax, r12d ; rowctr
test eax,eax
jle near .return
mov rbp,rsp
collect_args
- mov rcx, r13
+ mov ecx, r13d
shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
jz near .return
- mov rdx, r10
+ mov edx, r10d
; -- expand_right_edge
; -- h2v2_downsample
- mov rax, r12 ; rowctr
+ mov eax, r12d ; rowctr
test rax,rax
jle near .return
collect_args
push rbx
- mov rcx, r10 ; num_cols
+ mov ecx, r10d ; num_cols
test rcx,rcx
jz near .return
push rcx
mov rdi, r11
- mov rcx, r12
+ mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
collect_args
push rbx
- mov rcx, r10 ; col
+ mov ecx, r10d ; col
test rcx,rcx
jz near .return
push rcx
mov rdi, r11
- mov rcx, r12
+ mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
collect_args
push rbx
- mov rax, r10
+ mov eax, r10d
mov rdi, r11
- mov rcx, r12
+ mov ecx, r12d
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rbp,rsp
collect_args
- mov rax, r11 ; colctr
+ mov eax, r11d ; colctr
test rax,rax
jz near .return
collect_args
push rbx
- mov rax, r11 ; colctr
+ mov eax, r11d ; colctr
test rax,rax
jz near .return
mov rbp,rsp
collect_args
- mov rdx, r11
+ mov edx, r11d
add rdx, byte (2*SIZEOF_XMMWORD)-1
and rdx, byte -(2*SIZEOF_XMMWORD)
jz near .return
collect_args
push rbx
- mov rdx, r11
+ mov edx, r11d
add rdx, byte (2*SIZEOF_XMMWORD)-1
and rdx, byte -(2*SIZEOF_XMMWORD)
jz near .return
mov rax, [original_rbp]
lea rsi, [workspace] ; FAST_FLOAT * wsptr
mov rdi, r12 ; (JSAMPROW *)
- mov rax, r13
+ mov eax, r13d
mov rcx, DCTSIZE/4 ; ctr
.rowloop:
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *)
- mov rax, r13
+ mov eax, r13d
; -- Even part
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *)
- mov rax, r13
+ mov eax, r13d
; -- Even part
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *)
- mov rax, r13
+ mov eax, r13d
; -- Even part
; ---- Pass 2: process rows, store into output array.
mov rdi, r12 ; (JSAMPROW *)
- mov rax, r13
+ mov eax, r13d
; | input:| result:|
; | A0 B0 | |
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
mov rsi, r10
- mov rax, r11
+ mov eax, r11d
mov rdi, r12
mov rcx, DCTSIZE/2
.convloop:
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
mov rsi, r10
- mov rax, r11
+ mov eax, r11d
mov rdi, r12
mov rcx, DCTSIZE/4
.convloop: