+2.1 pre-beta
+============
+
+### Significant changes relative to 2.0.1:
+
+1. The build system, x86-64 SIMD extensions, and accelerated Huffman codec now
+support the x32 ABI on Linux, which allows for using x86-64 instructions with
+32-bit pointers. The x32 ABI is generally enabled by adding `-mx32` to the
+compiler flags.
+
+ Caveats:
+ - CMake 3.9.0 or later is required in order for the build system to
+automatically detect an x32 build.
+ - Java does not support the x32 ABI, and thus the TurboJPEG Java API will
+automatically be disabled with x32 builds.
+ - SIMD acceleration for progressive Huffman encoding does not (currently)
+work with the x32 ABI and will be disabled in x32 builds.
+
+2. Added Loongson MMI SIMD implementations of the RGB-to-grayscale, 4:2:2 fancy
+chroma upsampling, 4:2:2 and 4:2:0 merged chroma upsampling/color conversion,
+and fast integer DCT/IDCT algorithms. Relative to libjpeg-turbo 2.0.x, this
+speeds up:
+
+ - the compression of RGB source images into grayscale JPEG images by
+approximately 20%
+ - the decompression of 4:2:2 JPEG images by approximately 40-60% when
+using fancy upsampling
+ - the decompression of 4:2:2 and 4:2:0 JPEG images by approximately
+15-20% when using merged upsampling
+ - the compression of RGB source images by approximately 30-45% when using
+the fast integer DCT
+ - the decompression of JPEG images into RGB destination images by
+approximately 2x when using the fast integer IDCT
+
+ The overall decompression speedup for RGB images is now approximately
+2.3-3.7x (compared to 2-3.5x with libjpeg-turbo 2.0.x.)
+
+3. 32-bit (ARMv7 or ARMv7s) iOS builds of libjpeg-turbo are no longer
+supported, and the libjpeg-turbo build system can no longer be used to package
+such builds. 32-bit iOS apps cannot run in iOS 11 and later, and the App Store
+no longer allows them.
+
+4. The TurboJPEG Java wrapper in the "official" libjpeg-turbo SDK for macOS no
+longer supports 32-bit Java virtual machines. Oracle no longer provides a
+32-bit JVM for macOS, and Apple's implementation of Java 1.6 (Java for OS X
+systems) is long obsolete.
+
+
+ 2.0.4
+ =====
+
+ ### Significant changes relative to 2.0.3:
+
+ 1. Fixed a regression in the Windows packaging system (introduced by
+ 2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
+ 64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
+ one of them could be uninstalled.
+
+
2.0.3
=====
mov buffer, r11 ; r11 is now sratch
- mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
- mov put_bits, dword [r10+24] ; put_bits = state->cur.put_bits;
+ mov put_buffer, MMWORD [r10+SIZEOF_POINTER*2] ; put_buffer = state->cur.put_buffer;
- mov put_bits, DWORD [r10+SIZEOF_POINTER*2+8] ; put_bits = state->cur.put_bits;
++ mov put_bits, dword [r10+SIZEOF_POINTER*2+8] ; put_bits = state->cur.put_bits;
push r10 ; r10 is now scratch
; Encode the DC coefficient difference per section F.1.2.1
.EFN:
pop r10
; Save put_buffer & put_bits
- mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
- mov dword [r10+24], put_bits ; state->cur.put_bits = put_bits;
+ mov MMWORD [r10+SIZEOF_POINTER*2], put_buffer ; state->cur.put_buffer = put_buffer;
- mov DWORD [r10+SIZEOF_POINTER*2+8], put_bits ; state->cur.put_bits = put_bits;
++ mov dword [r10+SIZEOF_POINTER*2+8], put_bits ; state->cur.put_bits = put_bits;
pop rbx
uncollect_args 6
pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx
- mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
+ mov word [rdx+rax*SIZEOF_JSAMPLE], bx
+ mov word [rsi+rax*SIZEOF_JSAMPLE], cx
pop rbx
uncollect_args 4