From bdc7650b9e5a716a83898664d5d616c638072a49 Mon Sep 17 00:00:00 2001 From: DRC Date: Sat, 23 Aug 2014 15:57:38 +0000 Subject: [PATCH] ARM64 NEON SIMD support for YCC-to-RGB565 conversion git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1386 632fc199-4ca6-4c93-a231-07263d6284db --- simd/jsimd_arm64.c | 16 ++++++++++- simd/jsimd_arm_neon_64.S | 57 ++++++++++++++++++++++++++++++++++------ 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c index a346d16..65724cb 100644 --- a/simd/jsimd_arm64.c +++ b/simd/jsimd_arm64.c @@ -98,6 +98,17 @@ jsimd_can_ycc_rgb (void) GLOBAL(int) jsimd_can_ycc_rgb565 (void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + return 0; } @@ -145,7 +156,7 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, case JCS_EXT_ARGB: neonfct=jsimd_ycc_extxrgb_convert_neon; break; - default: + default: neonfct=jsimd_ycc_extrgb_convert_neon; break; } @@ -159,6 +170,9 @@ jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) { + if (simd_support & JSIMD_ARM_NEON) + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); } GLOBAL(int) diff --git a/simd/jsimd_arm_neon_64.S b/simd/jsimd_arm_neon_64.S index b0ba480..0ef770a 100644 --- a/simd/jsimd_arm_neon_64.S +++ b/simd/jsimd_arm_neon_64.S @@ -4,7 +4,7 @@ * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). * All rights reserved. * Author: Siarhei Siamashka - * Copyright (C) 2013, Linaro Limited + * Copyright (C) 2013-2014, Linaro Limited * Author: Ragesh Radhakrishnan * * This software is provided 'as-is', without any express or implied @@ -1576,7 +1576,20 @@ asm_function jsimd_idct_2x2_neon .else .error unsupported macroblock size .endif - .else + .elseif \bpp==16 + .if \size == 8 + st1 {v25.8h}, [RGB],16 + .elseif \size == 4 + st1 {v25.4h}, [RGB],8 + .elseif \size == 2 + st1 {v25.h}[4], [RGB],2 + st1 {v25.h}[5], [RGB],2 + .elseif \size == 1 + st1 {v25.h}[6], [RGB],2 + .else + .error unsupported macroblock size + .endif + .else .error unsupported bpp .endif .endm @@ -1610,24 +1623,33 @@ asm_function jsimd_idct_2x2_neon uaddw v20.8h, v20.8h, v0.8b uaddw v24.8h, v24.8h, v0.8b uaddw v28.8h, v28.8h, v0.8b +.if \bpp != 16 sqxtun v1\g_offs\defsize, v20.8h sqxtun v1\r_offs\defsize, v24.8h sqxtun v1\b_offs\defsize, v28.8h +.else + sqshlu v21.8h, v20.8h, #8 + sqshlu v25.8h, v24.8h, #8 + sqshlu v29.8h, v28.8h, #8 + sri v25.8h, v21.8h, #5 + sri v25.8h, v29.8h, #11 +.endif .endm .macro do_yuv_to_rgb_stage2_store_load_stage1 - ld1 {v4.8b}, [U], 8 rshrn v20.4h, v20.4s, #15 - rshrn2 v20.8h, v22.4s, #15 rshrn v24.4h, v24.4s, #14 - rshrn2 v24.8h, v26.4s, #14 rshrn v28.4h, v28.4s, #14 - ld1 {v5.8b}, [V], 8 + ld1 {v4.8b}, [U], 8 + rshrn2 v20.8h, v22.4s, #15 + rshrn2 v24.8h, v26.4s, #14 rshrn2 v28.8h, v30.4s, #14 + ld1 {v5.8b}, [V], 8 uaddw v20.8h, v20.8h, v0.8b uaddw v24.8h, v24.8h, v0.8b uaddw v28.8h, v28.8h, v0.8b +.if \bpp != 16 /**************** rgb24/rgb32 *********************************/ sqxtun v1\g_offs\defsize, v20.8h ld1 {v0.8b}, [Y], 8 sqxtun v1\r_offs\defsize, v24.8h @@ -1637,13 +1659,32 @@ asm_function jsimd_idct_2x2_neon sqxtun v1\b_offs\defsize, v28.8h uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ - do_store \bpp, 8 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ +.else /**************************** rgb565 ***********************************/ + sqshlu v21.8h, v20.8h, #8 + sqshlu v25.8h, v24.8h, #8 + sqshlu v29.8h, v28.8h, #8 + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ + ld1 {v0.8b}, [Y], 8 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ + sri v25.8h, v21.8h, #5 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ + prfm PLDL1KEEP, [U, #64] + prfm PLDL1KEEP, [V, #64] + prfm PLDL1KEEP, [Y, #64] + sri v25.8h, v29.8h, #11 +.endif + do_store \bpp, 8 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ .endm @@ -1812,6 +1853,6 @@ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, . generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b - +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b .purgem do_load .purgem do_store -- 2.40.0