From cb88e5da8003afcdc443b787fdcb77285e5a8a02 Mon Sep 17 00:00:00 2001 From: mayeut Date: Tue, 20 Sep 2016 21:06:24 +0200 Subject: [PATCH] ARM64 NEON: Fix another ABI conformance issue Based on https://github.com/mayeut/libjpeg-turbo/commit/98a5a9dc899aa9265858a3cbe0a96289a31a1322 with wordsmithing by DRC. In the AArch64 ABI, as in many others, it's forbidden to read/store data below the stack pointer. Some SIMD functions were doing just that (stack pointer misuse) when trying to preserve callee-saved registers, and this resulted in those registers being restored with incorrect contents under certain circumstances. This patch fixes that behavior, and callee-saved registers are now stored above the stack pointer throughout the function call. The patch also removes register saving in places where it is unnecessary for this ABI, or it makes use of unused scratch regiters instead of callee-saved registers. Fixes #97. Closes #101. Refer also to https://bugzilla.redhat.com/show_bug.cgi?id=1368569 --- ChangeLog.md | 6 +++ simd/jsimd_arm64_neon.S | 108 +++++++++++----------------------------- 2 files changed, 34 insertions(+), 80 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 9cd195f..7f417a4 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -83,6 +83,12 @@ same structure, it was not known to pose a security threat, but removing the warning makes it easier to detect actual security issues, should they arise in the future. +10. Fixed another ABI conformance issue in the 64-bit ARM (AArch64) NEON SIMD +code. Some of the routines were incorrectly reading and storing data below the +stack pointer, which caused segfaults in certain applications under specific +circumstances. + + 1.5.0 ===== diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S index 6c1a959..3309858 100644 --- a/simd/jsimd_arm64_neon.S +++ b/simd/jsimd_arm64_neon.S @@ -217,8 +217,9 @@ asm_function jsimd_idct_islow_neon sub sp, sp, #64 adr x15, Ljsimd_idct_islow_neon_consts - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32 ld1 {v0.8h, v1.8h}, [x15] ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 @@ -243,7 +244,6 @@ asm_function jsimd_idct_islow_neon shl v10.8h, v2.8h, #(PASS1_BITS) sqxtn v16.8b, v15.8h mov TMP1, v16.d[0] - sub sp, sp, #64 mvn TMP2, TMP1 cbnz TMP2, 2f @@ -1117,18 +1117,12 @@ asm_function jsimd_idct_4x4_neon uxtw x3, w3 /* Save all used NEON registers */ - sub sp, sp, 272 - str x15, [sp], 16 + sub sp, sp, 64 + mov x9, sp /* Load constants (v3.4h is just used for padding) */ adr TMP4, Ljsimd_idct_4x4_neon_consts - st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 - st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 - st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] /* Load all COEF_BLOCK into NEON registers with the following allocation: @@ -1237,16 +1231,8 @@ asm_function jsimd_idct_4x4_neon #endif /* vpop {v8.4h - v15.4h} ;not available */ - sub sp, sp, #272 - ldr x15, [sp], 16 - ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 - ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 - ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 blr x30 .unreq DCT_TABLE @@ -1320,18 +1306,13 @@ asm_function jsimd_idct_2x2_neon uxtw x3, w3 /* vpush {v8.4h - v15.4h} ; not available */ - sub sp, sp, 208 - str x15, [sp], 16 + sub sp, sp, 64 + mov x9, sp /* Load constants */ adr TMP2, Ljsimd_idct_2x2_neon_consts - st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - st1 {v21.8b, v22.8b}, [sp], 16 - st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - st1 {v30.8b, v31.8b}, [sp], 16 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 ld1 {v14.4h}, [TMP2] /* Load all COEF_BLOCK into NEON registers with the following allocation: @@ -1431,15 +1412,8 @@ asm_function jsimd_idct_2x2_neon st1 {v26.b}[1], [TMP2], 1 st1 {v27.b}[5], [TMP2], 1 - sub sp, sp, #208 - ldr x15, [sp], 16 - ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - ld1 {v21.8b, v22.8b}, [sp], 16 - ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - ld1 {v30.8b, v31.8b}, [sp], 16 blr x30 .unreq DCT_TABLE @@ -1719,13 +1693,13 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 INPUT_BUF2 .req x1 RGB .req x7 - Y .req x8 - U .req x9 - V .req x10 + Y .req x9 + U .req x10 + V .req x11 N .req w15 - sub sp, sp, 336 - str x15, [sp], 16 + sub sp, sp, 64 + mov x9, sp /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ .if \fast_st3 == 1 @@ -1735,23 +1709,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 .endif /* Save NEON registers */ - st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 - st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 - st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 ld1 {v0.4h, v1.4h}, [x15], 16 ld1 {v2.8h}, [x15] - /* Save ARM registers and handle input arguments */ - /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ - stp x4, x5, [sp], 16 - stp x6, x7, [sp], 16 - stp x8, x9, [sp], 16 - stp x10, x30, [sp], 16 ldr INPUT_BUF0, [INPUT_BUF] ldr INPUT_BUF1, [INPUT_BUF, #8] ldr INPUT_BUF2, [INPUT_BUF, #16] @@ -1818,21 +1780,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 b.gt 0b 9: /* Restore all registers and return */ - sub sp, sp, #336 - ldr x15, [sp], 16 - ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 - ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 - ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 - ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 - ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 - /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */ - ldp x4, x5, [sp], 16 - ldp x6, x7, [sp], 16 - ldp x8, x9, [sp], 16 - ldp x10, x30, [sp], 16 br x30 .unreq OUTPUT_WIDTH .unreq INPUT_ROW @@ -2101,8 +2050,9 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 /* Save NEON registers */ sub sp, sp, #64 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + mov x9, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 /* Outer loop over scanlines */ cmp NUM_ROWS, #1 @@ -2155,7 +2105,6 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 b.gt 0b 9: /* Restore all registers and return */ - sub sp, sp, #64 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 br x30 @@ -2359,8 +2308,9 @@ asm_function jsimd_fdct_islow_neon /* Save NEON registers */ sub sp, sp, #64 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + mov x10, sp + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32 /* Load all DATA into NEON registers with the following allocation: * 0 1 2 3 | 4 5 6 7 @@ -2590,7 +2540,6 @@ asm_function jsimd_fdct_islow_neon st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] /* Restore NEON registers */ - sub sp, sp, #64 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 @@ -3104,7 +3053,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl sub sp, sp, 272 sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ /* Save ARM registers */ - stp x19, x20, [sp], 16 + stp x19, x20, [sp] .if \fast_tbl == 1 adr x15, Ljsimd_huff_encode_one_block_neon_consts .else @@ -3318,7 +3267,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl and v18.16b, v18.16b, v23.16b add x3, x4, #0x400 /* r1 = dctbl->ehufsi */ and v20.16b, v20.16b, v23.16b - add x15, sp, #0x80 /* x15 = t2 */ + add x15, sp, #0x90 /* x15 = t2 */ and v22.16b, v22.16b, v23.16b ldr w10, [x4, x12, lsl #2] addp v16.16b, v16.16b, v18.16b @@ -3341,7 +3290,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl rbit x9, x9 /* x9 = index0 */ ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ cmp w12, #(64-8) - mov x11, sp + add x11, sp, #16 b.lt 4f cbz x9, 6f st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 @@ -3445,7 +3394,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl put_bits x3, x11 cbnz x9, 1b 6: - add x13, sp, #0xfe + add x13, sp, #0x10e cmp x15, x13 b.hs 1f ldr w12, [x5] @@ -3453,7 +3402,6 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl checkbuf47 put_bits x12, x14 1: - sub sp, sp, 16 str PUT_BUFFER, [x0, #0x10] str PUT_BITSw, [x0, #0x18] ldp x19, x20, [sp], 16 -- 2.40.0