From 4346f91fcb3b576fb037cef7dc94eac2e5a06250 Mon Sep 17 00:00:00 2001
From: DRC
Date: Tue, 14 Jun 2011 22:16:50 +0000
Subject: [PATCH] iOS ARM support

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@659 632fc199-4ca6-4c93-a231-07263d6284db
---
Review notes (placed below the three-dash marker, so they are not part of
the commit message):

What the patch does, per file:
* acinclude.m4: the NEON assembler probe now compiles with $CCAS and
  $CCASFLAGS instead of $CC/$CFLAGS, no longer requires .altmacro, and, if
  the plain probe fails, retries with "gas-preprocessor.pl $CCAS"; on
  success of the retry CCAS is rewritten to use the wrapper.
* configure.ac: adds --with-gas-preprocessor, which sets CCAS to
  gas-preprocessor.pl (-fix-unreq on darwin*, -no-fix-unreq elsewhere).
* simd/jsimd_arm.c: NEON is enabled unconditionally when the compiler
  defines __ARM_NEON__; otherwise /proc/cpuinfo is parsed on Linux and
  Android.
* simd/jsimd_arm_neon.S: drops .altmacro, so macro arguments are written
  as \arg; exported symbols get a leading underscore under __APPLE__; the
  YCbCr->RGB constants are emitted once per generated function so that
  adr can be used in place of adrl.

Changed during review:
* simd/jsimd_arm.c, init_simd(): the guard around the `bufsize`
  declaration is parenthesised as
      !defined(__ARM_NEON__) &&
      (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
  Since && binds tighter than ||, the unparenthesised form is true on
  Android even when __ARM_NEON__ is defined, which declares `bufsize` in
  a build where the #if/#elif chain below never reads it.
  The change edits one added line in place, so hunk sizes and the
  diffstat are unaffected; the post-image blob id on the index line is
  left as originally recorded.

Formatting caveat: spacing inside lines has been normalised; if the target
tree uses different column alignment, a whitespace-tolerant apply (for
example git apply --ignore-whitespace) may be needed -- TODO confirm
against the target tree.

 acinclude.m4          | 26 ++++++++++++-
 configure.ac          | 15 ++++++++
 simd/jsimd_arm.c      | 11 +++++-
 simd/jsimd_arm_neon.S | 90 ++++++++++++++++++++++++-------------------
 4 files changed, 98 insertions(+), 44 deletions(-)

diff --git a/acinclude.m4 b/acinclude.m4
index f6355bf..70e2555 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -140,18 +140,40 @@ fi
 # Test whether the assembler is suitable and supports NEON instructions
 AC_DEFUN([AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE],[
   ac_good_gnu_arm_assembler=no
+  ac_save_CC="$CC"
   ac_save_CFLAGS="$CFLAGS"
-  CFLAGS="-x assembler-with-cpp $CFLAGS"
+  CFLAGS="$CCASFLAGS -x assembler-with-cpp"
+  CC="$CCAS"
   AC_COMPILE_IFELSE([[
     .text
     .fpu neon
     .arch armv7a
     .object_arch armv4
     .arm
-    .altmacro
     pld [r0]
     vmovn.u16 d0, q0]], ac_good_gnu_arm_assembler=yes)
+
+  ac_use_gas_preprocessor=no
+  if test "x$ac_good_gnu_arm_assembler" = "xno" ; then
+    CC="gas-preprocessor.pl $CCAS"
+    AC_COMPILE_IFELSE([[
+      .text
+      .fpu neon
+      .arch armv7a
+      .object_arch armv4
+      .arm
+      pld [r0]
+      vmovn.u16 d0, q0]], ac_use_gas_preprocessor=yes)
+  fi
   CFLAGS="$ac_save_CFLAGS"
+  CC="$ac_save_CC"
+
+  if test "x$ac_use_gas_preprocessor" = "xyes" ; then
+    CCAS="gas-preprocessor.pl $CCAS"
+    AC_SUBST([CCAS])
+    ac_good_gnu_arm_assembler=yes
+  fi
+
   if test "x$ac_good_gnu_arm_assembler" = "xyes" ; then
     $1
   else
diff --git a/configure.ac b/configure.ac
index 9aa9dfa..83922c5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -260,6 +260,21 @@ AC_SUBST(JAVA_RPM_CONTENTS_1)
 AC_SUBST(JAVA_RPM_CONTENTS_2)
 AC_SUBST(RPM_CONFIG_ARGS)
 
+# optionally force using gas-preprocessor.pl for compatibility testing
+AC_ARG_WITH([gas-preprocessor],
+    AC_HELP_STRING([--with-gas-preprocessor],[Force using gas-preprocessor.pl on ARM.]))
+if test "x${with_gas_preprocessor}" = "xyes"; then
+  case $host_os in
+    darwin*)
+      CCAS="gas-preprocessor.pl -fix-unreq $CC"
+      ;;
+    *)
+      CCAS="gas-preprocessor.pl -no-fix-unreq $CC"
+      ;;
+  esac
+  AC_SUBST([CCAS])
+fi
+
 # SIMD is optional
 AC_ARG_WITH([simd],
     AC_HELP_STRING([--without-simd],[Omit SIMD extensions.]))
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
index b70b94e..b4d094d 100644
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -29,7 +29,7 @@
 
 static unsigned int simd_support = ~0;
 
-#ifdef __linux__
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
 
 #define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
 
@@ -100,14 +100,21 @@ LOCAL(void)
 init_simd (void)
 {
   char *env = NULL;
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
   int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
 
   if (simd_support != ~0)
     return;
 
   simd_support = 0;
 
-#ifdef __linux__
+#if defined(__ARM_NEON__)
+  simd_support |= JSIMD_ARM_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* We still have a chance to use NEON regardless of globally used
+   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux/android */
   while (!parse_proc_cpuinfo(bufsize)) {
     bufsize *= 2;
     if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index 2d66ab2..6f4de8d 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -30,28 +30,33 @@
 .fpu neon
 .arch armv7a
 .object_arch armv4
-.altmacro
 .arm
 
 /*****************************************************************************/
 
 /* Supplementary macro for setting function attributes */
 .macro asm_function fname
-    .func fname
-    .global fname
+#ifdef __APPLE__
+    .func _\fname
+    .globl _\fname
+_\fname:
+#else
+    .func \fname
+    .global \fname
 #ifdef __ELF__
-    .hidden fname
-    .type fname, %function
+    .hidden \fname
+    .type \fname, %function
+#endif
+\fname:
 #endif
-fname:
 .endm
 
 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
 .macro transpose_4x4 x0, x1, x2, x3
-    vtrn.16 x0, x1
-    vtrn.16 x2, x3
-    vtrn.32 x0, x2
-    vtrn.32 x1, x3
+    vtrn.16 \x0, \x1
+    vtrn.16 \x2, \x3
+    vtrn.32 \x0, \x2
+    vtrn.32 \x1, \x3
 .endm
 
 /*****************************************************************************/
@@ -224,7 +229,7 @@ asm_function jsimd_idct_ifast_neon
     .irp x, d4, d6, d8, d10, d12, d14, d16, d18
     ldr TMP, [OUTPUT_BUF], #4
     add TMP, TMP, OUTPUT_COL
-    vst1.8 {x}, [TMP]!
+    vst1.8 {\x}, [TMP]!
     .endr
 
     vpop {d8-d15}
@@ -252,22 +257,16 @@ asm_function jsimd_idct_ifast_neon
  * Colorspace conversion YCbCr -> RGB
  */
 
-.balign 16
-jsimd_ycc_rgb_neon_consts:
-    .short 0, 0, 0, 0
-    .short 22971, -11277, -23401, 29033
-    .short -128, -128, -128, -128
-    .short -128, -128, -128, -128
 
 .macro do_load size
-    .if size == 8
+    .if \size == 8
         vld1.8 {d4}, [U]!
         vld1.8 {d5}, [V]!
         vld1.8 {d0}, [Y]!
         pld [Y, #64]
         pld [U, #64]
         pld [V, #64]
-    .elseif size == 4
+    .elseif \size == 4
         vld1.8 {d4[0]}, [U]!
         vld1.8 {d4[1]}, [U]!
         vld1.8 {d4[2]}, [U]!
@@ -280,14 +279,14 @@ jsimd_ycc_rgb_neon_consts:
         vld1.8 {d0[1]}, [Y]!
         vld1.8 {d0[2]}, [Y]!
         vld1.8 {d0[3]}, [Y]!
-    .elseif size == 2
+    .elseif \size == 2
         vld1.8 {d4[4]}, [U]!
         vld1.8 {d4[5]}, [U]!
         vld1.8 {d5[4]}, [V]!
         vld1.8 {d5[5]}, [V]!
         vld1.8 {d0[4]}, [Y]!
         vld1.8 {d0[5]}, [Y]!
-    .elseif size == 1
+    .elseif \size == 1
         vld1.8 {d4[6]}, [U]!
         vld1.8 {d5[6]}, [V]!
         vld1.8 {d0[6]}, [Y]!
@@ -297,34 +296,34 @@ jsimd_ycc_rgb_neon_consts:
 .endm
 
 .macro do_store bpp, size
-    .if bpp == 24
-        .if size == 8
+    .if \bpp == 24
+        .if \size == 8
             vst3.8 {d10, d11, d12}, [RGB]!
-        .elseif size == 4
+        .elseif \size == 4
             vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
             vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
             vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
             vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
-        .elseif size == 2
+        .elseif \size == 2
             vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
             vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
-        .elseif size == 1
+        .elseif \size == 1
             vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
         .else
             .error unsupported macroblock size
         .endif
-    .elseif bpp == 32
-        .if size == 8
+    .elseif \bpp == 32
+        .if \size == 8
             vst4.8 {d10, d11, d12, d13}, [RGB]!
-        .elseif size == 4
+        .elseif \size == 4
             vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
             vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
             vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
             vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-        .elseif size == 2
+        .elseif \size == 2
             vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
             vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-        .elseif size == 1
+        .elseif \size == 1
             vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
         .else
             .error unsupported macroblock size
@@ -356,12 +355,23 @@ jsimd_ycc_rgb_neon_consts:
     vaddw.u8 q10, q10, d0
     vaddw.u8 q12, q12, d0
     vaddw.u8 q14, q14, d0
-    vqmovun.s16 d1&g_offs, q10
-    vqmovun.s16 d1&r_offs, q12
-    vqmovun.s16 d1&b_offs, q14
+    vqmovun.s16 d1\g_offs, q10
+    vqmovun.s16 d1\r_offs, q12
+    vqmovun.s16 d1\b_offs, q14
 .endm
 
-asm_function jsimd_ycc_&colorid&_convert_neon
+/* Apple gas crashes on adrl, work around that by using adr.
+ * But this requires a copy of these constants for each function.
+ */
+
+.balign 16
+jsimd_ycc_\colorid\()_neon_consts:
+    .short 0, 0, 0, 0
+    .short 22971, -11277, -23401, 29033
+    .short -128, -128, -128, -128
+    .short -128, -128, -128, -128
+
+asm_function jsimd_ycc_\colorid\()_convert_neon
     OUTPUT_WIDTH .req r0
     INPUT_BUF .req r1
     INPUT_ROW .req r2
@@ -379,7 +389,7 @@ asm_function jsimd_ycc_&colorid&_convert_neon
     N .req ip
 
     /* Load constants to d1, d2, d3 (d0 is just used for padding) */
-    adrl ip, jsimd_ycc_rgb_neon_consts
+    adr ip, jsimd_ycc_\colorid\()_neon_consts
     vld1.16 {d0, d1, d2, d3}, [ip, :128]
 
     /* Save ARM registers and handle input arguments */
@@ -414,7 +424,7 @@ asm_function jsimd_ycc_&colorid&_convert_neon
 1:
     do_load 8
     do_yuv_to_rgb
-    do_store bpp, 8
+    do_store \bpp, 8
     subs N, N, #8
     bge 1b
     tst N, #7
@@ -435,15 +445,15 @@ asm_function jsimd_ycc_&colorid&_convert_neon
     do_yuv_to_rgb
     tst N, #4
     beq 6f
-    do_store bpp, 4
+    do_store \bpp, 4
 6:
     tst N, #2
     beq 7f
-    do_store bpp, 2
+    do_store \bpp, 2
 7:
     tst N, #1
     beq 8f
-    do_store bpp, 1
+    do_store \bpp, 1
 8:
     subs NUM_ROWS, NUM_ROWS, #1
     bgt 0b
-- 
2.40.0