From 1120ff29a178ee666504f0067e7c079a6b792296 Mon Sep 17 00:00:00 2001 From: DRC Date: Wed, 13 Jul 2016 12:15:02 -0500 Subject: [PATCH] Fix AArch64 ABI conformance issue in SIMD code In the AArch64 ABI, the high (unused) DWORD of a 32-bit argument's register is undefined, so it was incorrect to use 64-bit instructions to transfer a JDIMENSION argument in the 64-bit NEON SIMD functions. The code worked thus far only because the existing compiler optimizers weren't smart enough to do anything else with the register in question, so the upper 32 bits happened to be all zeroes. The latest builds of Clang/LLVM have a smarter optimizer, and under certain circumstances, it will attempt to load-combine adjacent 32-bit integers from one of the libjpeg structures into a single 64-bit integer and pass that 64-bit integer as a 32-bit argument to one of the SIMD functions (which is allowed by the ABI, since the upper 32 bits of the 32-bit argument's register are undefined.) This caused the libjpeg-turbo regression tests to crash. This patch tries to use the Wn registers whenever possible. Otherwise, it uses a zero-extend instruction to avoid using the upper 32 bits of the 64-bit registers, which are not guaranteed to be valid for 32-bit arguments. Based on https://github.com/sebpop/libjpeg-turbo/commit/1fbae13021eb98f6fffdfaf8678fcdb00b0b04d9 Closes #91. Refer also to android-ndk/ndk#110 and https://llvm.org/bugs/show_bug.cgi?id=28393 --- ChangeLog.md | 10 +++++++++ simd/jsimd_arm64_neon.S | 50 ++++++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 426a83b..33b46dc 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -33,6 +33,16 @@ assume that AltiVec support is always available, which means that libjpeg-turbo cannot be used with G3 Macs unless you set the environment variable `JSIMD_FORCENONE` to `1`. +3. Fixed an issue whereby 64-bit ARM (AArch64) builds of libjpeg-turbo would +crash when built with recent releases of the Clang/LLVM compiler. This was +caused by an ABI conformance issue in some of libjpeg-turbo's 64-bit NEON SIMD +routines. Those routines were incorrectly using 64-bit instructions to +transfer a 32-bit JDIMENSION argument, whereas the ABI allows the upper +(unused) 32 bits of a 32-bit argument's register to be undefined. The new +Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit +structure members into a single 64-bit register, and this exposed the ABI +conformance issue. + 1.5.0 ===== diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S index 74d6c76..6c1a959 100644 --- a/simd/jsimd_arm64_neon.S +++ b/simd/jsimd_arm64_neon.S @@ -210,6 +210,11 @@ asm_function jsimd_idct_islow_neon TMP7 .req x13 TMP8 .req x14 + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + sub sp, sp, #64 adr x15, Ljsimd_idct_islow_neon_consts st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 @@ -807,6 +812,11 @@ asm_function jsimd_idct_ifast_neon TMP7 .req x13 TMP8 .req x14 + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + /* Load and dequantize coefficients into NEON registers * with the following allocation: * 0 1 2 3 | 4 5 6 7 @@ -1101,6 +1111,11 @@ asm_function jsimd_idct_4x4_neon TMP3 .req x2 TMP4 .req x15 + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + /* Save all used NEON registers */ sub sp, sp, 272 str x15, [sp], 16 @@ -1299,6 +1314,11 @@ asm_function jsimd_idct_2x2_neon TMP1 .req x0 TMP2 .req x15 + /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x3 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x3, w3 + /* vpush {v8.4h - v15.4h} ; not available */ sub sp, sp, 208 str x15, [sp], 16 @@ -1688,11 +1708,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon .else asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 .endif - OUTPUT_WIDTH .req x0 + OUTPUT_WIDTH .req w0 INPUT_BUF .req x1 - INPUT_ROW .req x2 + INPUT_ROW .req w2 OUTPUT_BUF .req x3 - NUM_ROWS .req x4 + NUM_ROWS .req w4 INPUT_BUF0 .req x5 INPUT_BUF1 .req x6 @@ -1702,7 +1722,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 Y .req x8 U .req x9 V .req x10 - N .req x15 + N .req w15 sub sp, sp, 336 str x15, [sp], 16 @@ -1745,11 +1765,10 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 cmp NUM_ROWS, #1 b.lt 9f 0: - lsl x16, INPUT_ROW, #3 - ldr Y, [INPUT_BUF0, x16] - ldr U, [INPUT_BUF1, x16] + ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3] + ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3] mov N, OUTPUT_WIDTH - ldr V, [INPUT_BUF2, x16] + ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3] add INPUT_ROW, INPUT_ROW, #1 ldr RGB, [OUTPUT_BUF], #8 @@ -2054,8 +2073,8 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 OUTPUT_WIDTH .req w0 INPUT_BUF .req x1 OUTPUT_BUF .req x2 - OUTPUT_ROW .req x3 - NUM_ROWS .req x4 + OUTPUT_ROW .req w3 + NUM_ROWS .req w4 OUTPUT_BUF0 .req x5 OUTPUT_BUF1 .req x6 @@ -2089,10 +2108,10 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 cmp NUM_ROWS, #1 b.lt 9f 0: - ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3] - ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3] + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3] + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3] mov N, OUTPUT_WIDTH - ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3] + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3] add OUTPUT_ROW, OUTPUT_ROW, #1 ldr RGB, [INPUT_BUF], #8 @@ -2199,6 +2218,11 @@ asm_function jsimd_convsamp_neon TMP8 .req x4 TMPDUP .req w3 + /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't + guarantee that the upper (unused) 32 bits of x1 are valid. This + instruction ensures that those bits are set to zero. */ + uxtw x1, w1 + mov TMPDUP, #128 ldp TMP1, TMP2, [SAMPLE_DATA], 16 ldp TMP3, TMP4, [SAMPLE_DATA], 16 -- 2.40.0