From 1120ff29a178ee666504f0067e7c079a6b792296 Mon Sep 17 00:00:00 2001
From: DRC <information@libjpeg-turbo.org>
Date: Wed, 13 Jul 2016 12:15:02 -0500
Subject: [PATCH] Fix AArch64 ABI conformance issue in SIMD code

In the AArch64 ABI, the high (unused) DWORD of a 32-bit argument's
register is undefined, so it was incorrect to use 64-bit
instructions to transfer a JDIMENSION argument in the 64-bit NEON SIMD
functions.  The code worked thus far only because the existing compiler
optimizers weren't smart enough to do anything else with the register in
question, so the upper 32 bits happened to be all zeroes.

The latest builds of Clang/LLVM have a smarter optimizer, and under
certain circumstances, it will attempt to load-combine adjacent 32-bit
integers from one of the libjpeg structures into a single 64-bit integer
and pass that 64-bit integer as a 32-bit argument to one of the SIMD
functions (which is allowed by the ABI, since the upper 32 bits of the
32-bit argument's register are undefined.)  This caused the
libjpeg-turbo regression tests to crash.

This patch tries to use the Wn registers whenever possible.  Otherwise,
it uses a zero-extend instruction to avoid using the upper 32 bits of
the 64-bit registers, which are not guaranteed to be valid for 32-bit
arguments.

Based on https://github.com/sebpop/libjpeg-turbo/commit/1fbae13021eb98f6fffdfaf8678fcdb00b0b04d9

Closes #91.  Refer also to android-ndk/ndk#110 and
https://llvm.org/bugs/show_bug.cgi?id=28393
---
 ChangeLog.md            | 10 +++++++++
 simd/jsimd_arm64_neon.S | 50 ++++++++++++++++++++++++++++++-----------
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/ChangeLog.md b/ChangeLog.md
index 426a83b..33b46dc 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -33,6 +33,16 @@ assume that AltiVec support is always available, which means that libjpeg-turbo
 cannot be used with G3 Macs unless you set the environment variable
 `JSIMD_FORCENONE` to `1`.
 
+3. Fixed an issue whereby 64-bit ARM (AArch64) builds of libjpeg-turbo would
+crash when built with recent releases of the Clang/LLVM compiler.  This was
+caused by an ABI conformance issue in some of libjpeg-turbo's 64-bit NEON SIMD
+routines.  Those routines were incorrectly using 64-bit instructions to
+transfer a 32-bit JDIMENSION argument, whereas the ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined.  The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
 
 1.5.0
 =====
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 74d6c76..6c1a959 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -210,6 +210,11 @@ asm_function jsimd_idct_islow_neon
     TMP7            .req x13
     TMP8            .req x14
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     sub             sp, sp, #64
     adr             x15, Ljsimd_idct_islow_neon_consts
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
@@ -807,6 +812,11 @@ asm_function jsimd_idct_ifast_neon
     TMP7            .req x13
     TMP8            .req x14
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     /* Load and dequantize coefficients into NEON registers
      * with the following allocation:
      *       0 1 2 3 | 4 5 6 7
@@ -1101,6 +1111,11 @@ asm_function jsimd_idct_4x4_neon
     TMP3            .req x2
     TMP4            .req x15
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     /* Save all used NEON registers */
     sub             sp, sp, 272
     str             x15, [sp], 16
@@ -1299,6 +1314,11 @@ asm_function jsimd_idct_2x2_neon
     TMP1            .req x0
     TMP2            .req x15
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     /* vpush           {v8.4h - v15.4h}            ; not available */
     sub             sp, sp, 208
     str             x15, [sp], 16
@@ -1688,11 +1708,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
 .else
 asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
 .endif
-    OUTPUT_WIDTH    .req x0
+    OUTPUT_WIDTH    .req w0
     INPUT_BUF       .req x1
-    INPUT_ROW       .req x2
+    INPUT_ROW       .req w2
     OUTPUT_BUF      .req x3
-    NUM_ROWS        .req x4
+    NUM_ROWS        .req w4
 
     INPUT_BUF0      .req x5
     INPUT_BUF1      .req x6
@@ -1702,7 +1722,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     Y               .req x8
     U               .req x9
     V               .req x10
-    N               .req x15
+    N               .req w15
 
     sub             sp, sp, 336
     str             x15, [sp], 16
@@ -1745,11 +1765,10 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     cmp             NUM_ROWS, #1
     b.lt            9f
 0:
-    lsl             x16, INPUT_ROW, #3
-    ldr             Y, [INPUT_BUF0, x16]
-    ldr             U, [INPUT_BUF1, x16]
+    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
+    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
     mov             N, OUTPUT_WIDTH
-    ldr             V, [INPUT_BUF2, x16]
+    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
     add             INPUT_ROW, INPUT_ROW, #1
     ldr             RGB, [OUTPUT_BUF], #8
 
@@ -2054,8 +2073,8 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
     OUTPUT_WIDTH    .req w0
     INPUT_BUF       .req x1
     OUTPUT_BUF      .req x2
-    OUTPUT_ROW      .req x3
-    NUM_ROWS        .req x4
+    OUTPUT_ROW      .req w3
+    NUM_ROWS        .req w4
 
     OUTPUT_BUF0     .req x5
     OUTPUT_BUF1     .req x6
@@ -2089,10 +2108,10 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
     cmp             NUM_ROWS, #1
     b.lt            9f
 0:
-    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3]
-    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3]
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
     mov             N, OUTPUT_WIDTH
-    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3]
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
     add             OUTPUT_ROW, OUTPUT_ROW, #1
     ldr             RGB, [INPUT_BUF], #8
 
@@ -2199,6 +2218,11 @@ asm_function jsimd_convsamp_neon
     TMP8            .req x4
     TMPDUP          .req w3
 
+    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x1 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x1, w1
+
     mov             TMPDUP, #128
     ldp             TMP1, TMP2, [SAMPLE_DATA], 16
     ldp             TMP3, TMP4, [SAMPLE_DATA], 16
-- 
2.40.0