From ec6941f7bc5b3ea354825cef24085e8d4abbab02 Mon Sep 17 00:00:00 2001
From: DRC
Date: Fri, 15 Jan 2016 09:29:11 -0600
Subject: [PATCH] Complete the ARM64 NEON SIMD implementation

This adds 64-bit NEON coverage for all of the algorithms that are covered
by the 32-bit NEON implementation, except for h2v1 (4:2:2) fancy upsampling
(used when decompressing 4:2:2 JPEG images).  It also adds 64-bit NEON SIMD
coverage for:

* slow integer forward DCT (compressor)
* h2v2 (4:2:0) downsampling (compressor)
* h2v1 (4:2:2) downsampling (compressor)

which are not covered in the 32-bit implementation.

Compression speedups relative to libjpeg-turbo 1.4.2:

Apple A7 (iPhone 5S), iOS, 64-bit: 113-150% (reported)
48-core ThunderX (RunAbove ARM Cloud), Linux, 64-bit: 2.1-33% (avg. 15%)

Refer to #44 and #49 for discussion.

This commit also removes the unnecessary
if (simd_support & JSIMD_ARM_NEON) statements from the jsimd* algorithm
functions.  Since the jsimd_can*() functions check for the existence of
NEON, the corresponding algorithm functions will never be called if NEON
isn't available (a sketch of this dispatch contract appears below, after
the first jsimd_arm64.c hunk).

Based on:
https://github.com/mayeut/libjpeg-turbo/commit/dcd9d84f10fae192c0e3935818dc289bca9c3e29
https://github.com/mayeut/libjpeg-turbo/commit/b0d87b811f37bd560083deea8c6e7d704e5cd944
https://github.com/mayeut/libjpeg-turbo/commit/70cd5c8a493a67f4d54dd2067ae6dedb65d95389
https://github.com/mayeut/libjpeg-turbo/commit/3e58d9a064648503c57ec2650ee79880f749a52b
https://github.com/mayeut/libjpeg-turbo/commit/837b19542f53fa81af83e6ba002d559877aaf597
https://github.com/mayeut/libjpeg-turbo/commit/73dc43ccc870c2e10ba893e9764b8e48d6836585
https://github.com/mayeut/libjpeg-turbo/commit/a82b71a261b4c0213f558baf4bc745f1c27356d8
https://github.com/mayeut/libjpeg-turbo/commit/c1b1188c2106d6ea7b76644b6023b57edeb602e1
https://github.com/mayeut/libjpeg-turbo/commit/305c89284e1bb222b34fbc7261f697a0cc452a41
https://github.com/mayeut/libjpeg-turbo/commit/7f443f99950b4d7d442b9b879648eca5273209bd
https://github.com/mayeut/libjpeg-turbo/commit/4c2b53b77da5a20e30e2aadaeddb0efbfe24e06d

Unified version with fixes:
https://github.com/mayeut/libjpeg-turbo/commit/1004a3cd05870612a194b410efeaa1b4da76d246
---
 ChangeLog.txt           |    7 +
 simd/jsimd.h            |   12 +
 simd/jsimd_arm64.c      |  148 ++++-
 simd/jsimd_arm64_neon.S | 1211 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 1359 insertions(+), 19 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index f79660e..379cfbd 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -71,6 +71,13 @@ setting the JSIMD_NOHUFFENC environment variable to 1.
 platforms. This speeds up the compression of full-color JPEGs by about 30%
 on average.
 
+[14] Completed the ARM 64-bit (ARMv8) NEON SIMD implementation. 64-bit ARM
+now has SIMD coverage for all of the algorithms that are covered in the 32-bit
+(ARMv7) implementation, except for h2v1 (4:2:2) fancy upsampling.
+Additionally, the ARM 64-bit SIMD implementation now accelerates the slow
+integer forward DCT and h2v2 & h2v1 downsampling algorithms, which are not
+accelerated in the 32-bit implementation.
+ 1.4.2 ===== diff --git a/simd/jsimd.h b/simd/jsimd.h index e259fea..1c598f0 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -360,6 +360,11 @@ EXTERN(void) jsimd_h2v1_downsample_sse2 JDIMENSION v_samp_factor, JDIMENSION width_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); +EXTERN(void) jsimd_h2v1_downsample_neon + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + EXTERN(void) jsimd_h2v1_downsample_mips_dspr2 (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, JDIMENSION width_blocks, @@ -381,6 +386,11 @@ EXTERN(void) jsimd_h2v2_downsample_sse2 JDIMENSION v_samp_factor, JDIMENSION width_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); +EXTERN(void) jsimd_h2v2_downsample_neon + (JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data); + EXTERN(void) jsimd_h2v2_downsample_mips_dspr2 (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, JDIMENSION width_blocks, @@ -680,6 +690,8 @@ EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM * data); extern const int jconst_fdct_islow_sse2[]; EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM * data); +EXTERN(void) jsimd_fdct_islow_neon (DCTELEM * data); + EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data); EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM * data); diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c index 585feeb..2b0e6f7 100644 --- a/simd/jsimd_arm64.c +++ b/simd/jsimd_arm64.c @@ -2,8 +2,8 @@ * jsimd_arm64.c * * Copyright 2009 Pierre Ossman for Cendio AB - * Copyright 2009-2011, 2013-2014 D. R. Commander - * Copyright 2015 Matthieu Darbois + * Copyright 2009-2011, 2013-2014, 2016 D. R. Commander + * Copyright 2015-2016 Matthieu Darbois * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. 
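
The jsimd_can_*() / jsimd_*() contract that makes the removed runtime checks
redundant can be illustrated with a short, self-contained C sketch.  This is
not part of the patch; the *_sketch names are hypothetical stand-ins, and
libjpeg-turbo's real selection logic lives in jcdctmgr.c and its siblings:

  #include <stdio.h>

  typedef short DCTELEM;

  static unsigned int simd_support_sketch = 0;  /* stands in for simd_support */
  #define JSIMD_ARM_NEON_SKETCH  0x08

  static void jpeg_fdct_islow_sketch (DCTELEM *data)  { (void)data; puts("C fallback"); }
  static void jsimd_fdct_islow_sketch (DCTELEM *data) { (void)data; puts("NEON path"); }

  /* The "can" function is the only place that tests for NEON support... */
  static int jsimd_can_fdct_islow_sketch (void)
  {
    return (simd_support_sketch & JSIMD_ARM_NEON_SKETCH) ? 1 : 0;
  }

  int main (void)
  {
    /* ...because the callback is installed only when the "can" function
       returns 1, the algorithm function itself needs no runtime check. */
    void (*fdct) (DCTELEM *) = jsimd_can_fdct_islow_sketch() ?
                               jsimd_fdct_islow_sketch : jpeg_fdct_islow_sketch;
    DCTELEM workspace[64] = { 0 };
    fdct(workspace);
    return 0;
  }

This roughly mirrors how jcdctmgr.c chooses between jsimd_fdct_islow() and
the portable jpeg_fdct_islow() at initialization time.
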
@@ -66,6 +66,17 @@ jsimd_can_rgb_ycc (void) { init_simd(); + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + return 0; } @@ -118,6 +129,37 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows) { + void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch(cinfo->in_color_space) { + case JCS_EXT_RGB: + neonfct=jsimd_extrgb_ycc_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct=jsimd_extrgbx_ycc_convert_neon; + break; + case JCS_EXT_BGR: + neonfct=jsimd_extbgr_ycc_convert_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct=jsimd_extbgrx_ycc_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct=jsimd_extxbgr_ycc_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct=jsimd_extxrgb_ycc_convert_neon; + break; + default: + neonfct=jsimd_extrgb_ycc_convert_neon; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); } GLOBAL(void) @@ -162,8 +204,7 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, break; } - if (simd_support & JSIMD_ARM_NEON) - neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); } GLOBAL(void) @@ -171,9 +212,8 @@ jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) { - if (simd_support & JSIMD_ARM_NEON) - jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, - output_buf, num_rows); + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); } GLOBAL(int) @@ -181,6 +221,17 @@ jsimd_can_h2v2_downsample (void) { init_simd(); + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + return 0; } @@ -189,6 +240,17 @@ jsimd_can_h2v1_downsample (void) { init_simd(); + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + return 0; } @@ -196,12 +258,18 @@ GLOBAL(void) jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); } GLOBAL(void) jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); } GLOBAL(int) @@ -305,6 +373,19 @@ jsimd_can_convsamp (void) { init_simd(); + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + return 0; } @@ -320,6 +401,7 @@ GLOBAL(void) jsimd_convsamp (JSAMPARRAY 
sample_data, JDIMENSION start_col, DCTELEM * workspace) { + jsimd_convsamp_neon(sample_data, start_col, workspace); } GLOBAL(void) @@ -333,6 +415,15 @@ jsimd_can_fdct_islow (void) { init_simd(); + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + return 0; } @@ -341,6 +432,15 @@ jsimd_can_fdct_ifast (void) { init_simd(); + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + return 0; } @@ -355,11 +455,13 @@ jsimd_can_fdct_float (void) GLOBAL(void) jsimd_fdct_islow (DCTELEM * data) { + jsimd_fdct_islow_neon(data); } GLOBAL(void) jsimd_fdct_ifast (DCTELEM * data) { + jsimd_fdct_ifast_neon(data); } GLOBAL(void) @@ -372,6 +474,17 @@ jsimd_can_quantize (void) { init_simd(); + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + return 0; } @@ -387,6 +500,7 @@ GLOBAL(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace) { + jsimd_quantize_neon(coef_block, divisors, workspace); } GLOBAL(void) @@ -446,9 +560,8 @@ jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { - if (simd_support & JSIMD_ARM_NEON) - jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, - output_col); + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, + output_col); } GLOBAL(void) @@ -456,9 +569,8 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { - if (simd_support & JSIMD_ARM_NEON) - jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, - output_col); + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, + output_col); } GLOBAL(int) @@ -522,9 +634,8 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { - if (simd_support & JSIMD_ARM_NEON) - jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, - output_col); + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, + output_col); } GLOBAL(void) @@ -532,9 +643,8 @@ jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { - if (simd_support & JSIMD_ARM_NEON) - jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, - output_col); + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, + output_col); } GLOBAL(void) diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S index 89fc660..040386d 100644 --- a/simd/jsimd_arm64_neon.S +++ b/simd/jsimd_arm64_neon.S @@ -7,6 +7,7 @@ * Copyright (C) 2013-2014, Linaro Limited * Author: Ragesh Radhakrishnan * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. 
In no event will the authors be held liable for any damages @@ -91,6 +92,35 @@ _\fname: transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b .endm +.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3 + trn1 \t0\().8h, \l0\().8h, \l1\().8h + trn1 \t1\().8h, \l2\().8h, \l3\().8h + trn1 \t2\().8h, \l4\().8h, \l5\().8h + trn1 \t3\().8h, \l6\().8h, \l7\().8h + trn2 \l1\().8h, \l0\().8h, \l1\().8h + trn2 \l3\().8h, \l2\().8h, \l3\().8h + trn2 \l5\().8h, \l4\().8h, \l5\().8h + trn2 \l7\().8h, \l6\().8h, \l7\().8h + + trn1 \l4\().4s, \t2\().4s, \t3\().4s + trn2 \t3\().4s, \t2\().4s, \t3\().4s + trn1 \t2\().4s, \t0\().4s, \t1\().4s + trn2 \l2\().4s, \t0\().4s, \t1\().4s + trn1 \t0\().4s, \l1\().4s, \l3\().4s + trn2 \l3\().4s, \l1\().4s, \l3\().4s + trn2 \t1\().4s, \l5\().4s, \l7\().4s + trn1 \l5\().4s, \l5\().4s, \l7\().4s + + trn2 \l6\().2d, \l2\().2d, \t3\().2d + trn1 \l0\().2d, \t2\().2d, \l4\().2d + trn1 \l1\().2d, \t0\().2d, \l5\().2d + trn2 \l7\().2d, \l3\().2d, \t1\().2d + trn1 \l2\().2d, \l2\().2d, \t3\().2d + trn2 \l4\().2d, \t2\().2d, \l4\().2d + trn1 \l3\().2d, \l3\().2d, \t1\().2d + trn2 \l5\().2d, \t0\().2d, \l5\().2d +.endm + #define CENTERJSAMPLE 128 @@ -1055,6 +1085,7 @@ asm_function jsimd_idct_ifast_neon .unreq TMP2 .unreq TMP3 .unreq TMP4 + .unreq TMP5 /*****************************************************************************/ @@ -1859,3 +1890,1183 @@ generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, . generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b .purgem do_load .purgem do_store + +/*****************************************************************************/ + +/* + * jsimd_extrgb_ycc_convert_neon + * jsimd_extbgr_ycc_convert_neon + * jsimd_extrgbx_ycc_convert_neon + * jsimd_extbgrx_ycc_convert_neon + * jsimd_extxbgr_ycc_convert_neon + * jsimd_extxrgb_ycc_convert_neon + * + * Colorspace conversion RGB -> YCbCr + */ + +.macro do_store size + .if \size == 8 + st1 {v20.8b}, [Y], #8 + st1 {v21.8b}, [U], #8 + st1 {v22.8b}, [V], #8 + .elseif \size == 4 + st1 {v20.b}[0], [Y], #1 + st1 {v20.b}[1], [Y], #1 + st1 {v20.b}[2], [Y], #1 + st1 {v20.b}[3], [Y], #1 + st1 {v21.b}[0], [U], #1 + st1 {v21.b}[1], [U], #1 + st1 {v21.b}[2], [U], #1 + st1 {v21.b}[3], [U], #1 + st1 {v22.b}[0], [V], #1 + st1 {v22.b}[1], [V], #1 + st1 {v22.b}[2], [V], #1 + st1 {v22.b}[3], [V], #1 + .elseif \size == 2 + st1 {v20.b}[4], [Y], #1 + st1 {v20.b}[5], [Y], #1 + st1 {v21.b}[4], [U], #1 + st1 {v21.b}[5], [U], #1 + st1 {v22.b}[4], [V], #1 + st1 {v22.b}[5], [V], #1 + .elseif \size == 1 + st1 {v20.b}[6], [Y], #1 + st1 {v21.b}[6], [U], #1 + st1 {v22.b}[6], [V], #1 + .else + .error unsupported macroblock size + .endif +.endm + +.macro do_load bpp, size + .if \bpp == 24 + .if \size == 8 + ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24 + prfm pldl1keep, [RGB, #128] + .elseif \size == 4 + ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3 + .elseif \size == 2 + ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3 + ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3 + .elseif \size == 1 + ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3 + .else + .error unsupported macroblock size + .endif + .elseif \bpp == 32 + .if \size == 8 + ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32 + prfm pldl1keep, [RGB, #128] + .elseif \size == 4 + ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4 
+ ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4 + .elseif \size == 2 + ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4 + ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4 + .elseif \size == 1 + ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4 + .else + .error unsupported macroblock size + .endif + .else + .error unsupported bpp + .endif +.endm + +.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs + +/* + * 2-stage pipelined RGB->YCbCr conversion + */ + +.macro do_rgb_to_yuv_stage1 + ushll v4.8h, v1\r_offs\().8b, #0 /* r = { d4, d5 } */ + ushll v6.8h, v1\g_offs\().8b, #0 /* g = { d6, d7 } */ + ushll v8.8h, v1\b_offs\().8b, #0 /* b = { d8, d9 } */ + ins v5.d[0], v4.d[1] + ins v7.d[0], v6.d[1] + ins v9.d[0], v8.d[1] + rev64 v18.4s, v1.4s + rev64 v26.4s, v1.4s + rev64 v28.4s, v1.4s + rev64 v30.4s, v1.4s + umull v14.4s, v4.4h, v0.h[0] + umull v16.4s, v5.4h, v0.h[0] + umlsl v18.4s, v4.4h, v0.h[3] + umlsl v26.4s, v5.4h, v0.h[3] + umlal v28.4s, v4.4h, v0.h[5] + umlal v30.4s, v5.4h, v0.h[5] + umlal v14.4s, v6.4h, v0.h[1] + umlal v16.4s, v7.4h, v0.h[1] + umlsl v18.4s, v6.4h, v0.h[4] + umlsl v26.4s, v7.4h, v0.h[4] + umlsl v28.4s, v6.4h, v0.h[6] + umlsl v30.4s, v7.4h, v0.h[6] + umlal v14.4s, v8.4h, v0.h[2] + umlal v16.4s, v9.4h, v0.h[2] + umlal v18.4s, v8.4h, v0.h[5] + umlal v26.4s, v9.4h, v0.h[5] + umlsl v28.4s, v8.4h, v0.h[7] + umlsl v30.4s, v9.4h, v0.h[7] +.endm + +.macro do_rgb_to_yuv_stage2 + rshrn v20.4h, v14.4s, #16 + rshrn v21.4h, v16.4s, #16 + shrn v22.4h, v18.4s, #16 + shrn v23.4h, v26.4s, #16 + shrn v24.4h, v28.4s, #16 + shrn v25.4h, v30.4s, #16 + ins v20.d[1], v21.d[0] + ins v22.d[1], v23.d[0] + ins v24.d[1], v25.d[0] + xtn v20.8b, v20.8h /* v20 = y */ + xtn v21.8b, v22.8h /* v21 = u */ + xtn v22.8b, v24.8h /* v22 = v */ +.endm + +.macro do_rgb_to_yuv + do_rgb_to_yuv_stage1 + do_rgb_to_yuv_stage2 +.endm + +.macro do_rgb_to_yuv_stage2_store_load_stage1 + rshrn v20.4h, v14.4s, #16 + rshrn v21.4h, v16.4s, #16 + shrn v22.4h, v18.4s, #16 + rev64 v18.4s, v1.4s + shrn v23.4h, v26.4s, #16 + ins v20.d[1], v21.d[0] + rev64 v26.4s, v1.4s + shrn v24.4h, v28.4s, #16 + shrn v25.4h, v30.4s, #16 + ins v22.d[1], v23.d[0] + do_load \bpp, 8 + xtn v20.8b, v20.8h /* dv0 = y */ + ins v24.d[1], v25.d[0] + ushll v4.8h, v1\r_offs\().8b, #0 /* r = { v4.8h } */ + xtn v21.8b, v22.8h /* v21 = u */ + ushll v6.8h, v1\g_offs\().8b, #0 /* g = { v6.8h } */ + ushll v8.8h, v1\b_offs\().8b, #0 /* b = { v8.8h } */ + xtn v22.8b, v24.8h /* v22 = v */ + ins v5.d[0], v4.d[1] + ins v7.d[0], v6.d[1] + ins v9.d[0], v8.d[1] + st1 {v20.8b}, [Y], #8 + umull v14.4s, v4.4h, v0.h[0] + umull v16.4s, v5.4h, v0.h[0] + umlsl v18.4s, v4.4h, v0.h[3] + umlal v14.4s, v6.4h, v0.h[1] + umlal v16.4s, v7.4h, v0.h[1] + umlsl v18.4s, v6.4h, v0.h[4] + umlal v14.4s, v8.4h, v0.h[2] + umlal v16.4s, v9.4h, v0.h[2] + umlal v18.4s, v8.4h, v0.h[5] + rev64 v28.4s, v1.4s + rev64 v30.4s, v1.4s + st1 {v21.8b}, [U], #8 + umlsl v26.4s, v5.4h, v0.h[3] + umlal v28.4s, v4.4h, v0.h[5] + umlal v30.4s, v5.4h, v0.h[5] + st1 {v22.8b}, [V], #8 + umlsl v26.4s, v7.4h, v0.h[4] + umlsl v28.4s, v6.4h, v0.h[6] + umlsl v30.4s, v7.4h, v0.h[6] + umlal v26.4s, v9.4h, v0.h[5] + umlsl v28.4s, v8.4h, v0.h[7] + umlsl v30.4s, v9.4h, v0.h[7] +.endm + +.balign 16 +Ljsimd_\colorid\()_ycc_neon_consts: + .short 19595, 38470, 7471, 11059 + .short 21709, 32768, 27439, 5329 + .short 32767, 128, 32767, 128 + .short 32767, 128, 32767, 128 + +asm_function jsimd_\colorid\()_ycc_convert_neon + OUTPUT_WIDTH .req w0 + INPUT_BUF .req x1 + OUTPUT_BUF .req x2 + OUTPUT_ROW .req x3 + 
NUM_ROWS .req x4 + + OUTPUT_BUF0 .req x5 + OUTPUT_BUF1 .req x6 + OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */ + + RGB .req x7 + Y .req x9 + U .req x10 + V .req x11 + N .req w12 + + /* Load constants to d0, d1, d2, d3 */ + adr x13, Ljsimd_\colorid\()_ycc_neon_consts + ld1 {v0.8h, v1.8h}, [x13] + + ldr OUTPUT_BUF0, [OUTPUT_BUF] + ldr OUTPUT_BUF1, [OUTPUT_BUF, #8] + ldr OUTPUT_BUF2, [OUTPUT_BUF, #16] + .unreq OUTPUT_BUF + + /* Save NEON registers */ + sub sp, sp, #64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + + /* Outer loop over scanlines */ + cmp NUM_ROWS, #1 + b.lt 9f +0: + ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3] + ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3] + mov N, OUTPUT_WIDTH + ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3] + add OUTPUT_ROW, OUTPUT_ROW, #1 + ldr RGB, [INPUT_BUF], #8 + + /* Inner loop over pixels */ + subs N, N, #8 + b.lt 3f + do_load \bpp, 8 + do_rgb_to_yuv_stage1 + subs N, N, #8 + b.lt 2f +1: + do_rgb_to_yuv_stage2_store_load_stage1 + subs N, N, #8 + b.ge 1b +2: + do_rgb_to_yuv_stage2 + do_store 8 + tst N, #7 + b.eq 8f +3: + tbz N, #2, 3f + do_load \bpp, 4 +3: + tbz N, #1, 4f + do_load \bpp, 2 +4: + tbz N, #0, 5f + do_load \bpp, 1 +5: + do_rgb_to_yuv + tbz N, #2, 6f + do_store 4 +6: + tbz N, #1, 7f + do_store 2 +7: + tbz N, #0, 8f + do_store 1 +8: + subs NUM_ROWS, NUM_ROWS, #1 + b.gt 0b +9: + /* Restore all registers and return */ + sub sp, sp, #64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + br x30 + + .unreq OUTPUT_WIDTH + .unreq OUTPUT_ROW + .unreq INPUT_BUF + .unreq NUM_ROWS + .unreq OUTPUT_BUF0 + .unreq OUTPUT_BUF1 + .unreq OUTPUT_BUF2 + .unreq RGB + .unreq Y + .unreq U + .unreq V + .unreq N + +.purgem do_rgb_to_yuv +.purgem do_rgb_to_yuv_stage1 +.purgem do_rgb_to_yuv_stage2 +.purgem do_rgb_to_yuv_stage2_store_load_stage1 + +.endm + +/*--------------------------------- id ----- bpp R G B */ +generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 +generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 +generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 +generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 + +.purgem do_load +.purgem do_store + +/*****************************************************************************/ + +/* + * Load data into workspace, applying unsigned->signed conversion + * + * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get + * rid of VST1.16 instructions + */ + +asm_function jsimd_convsamp_neon + SAMPLE_DATA .req x0 + START_COL .req x1 + WORKSPACE .req x2 + TMP1 .req x9 + TMP2 .req x10 + TMP3 .req x11 + TMP4 .req x12 + TMP5 .req x13 + TMP6 .req x14 + TMP7 .req x15 + TMP8 .req x4 + TMPDUP .req w3 + + + mov TMPDUP, #128 + ldp TMP1, TMP2, [SAMPLE_DATA], 16 + ldp TMP3, TMP4, [SAMPLE_DATA], 16 + dup v0.8b, TMPDUP + add TMP1, TMP1, START_COL + add TMP2, TMP2, START_COL + ldp TMP5, TMP6, [SAMPLE_DATA], 16 + add TMP3, TMP3, START_COL + add TMP4, TMP4, START_COL + ldp TMP7, TMP8, [SAMPLE_DATA], 16 + add TMP5, TMP5, START_COL + add TMP6, TMP6, START_COL + ld1 {v16.8b}, [TMP1] + add TMP7, TMP7, START_COL + add TMP8, TMP8, START_COL + ld1 {v17.8b}, [TMP2] + usubl v16.8h, v16.8b, v0.8b + ld1 {v18.8b}, [TMP3] + usubl v17.8h, v17.8b, v0.8b + ld1 {v19.8b}, [TMP4] + usubl v18.8h, v18.8b, v0.8b + ld1 {v20.8b}, [TMP5] + usubl v19.8h, v19.8b, v0.8b + ld1 {v21.8b}, [TMP6] + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64 + 
usubl v20.8h, v20.8b, v0.8b
+    ld1 {v22.8b}, [TMP7]
+    usubl v21.8h, v21.8b, v0.8b
+    ld1 {v23.8b}, [TMP8]
+    usubl v22.8h, v22.8b, v0.8b
+    usubl v23.8h, v23.8b, v0.8b
+    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
+
+    br x30
+
+    .unreq SAMPLE_DATA
+    .unreq START_COL
+    .unreq WORKSPACE
+    .unreq TMP1
+    .unreq TMP2
+    .unreq TMP3
+    .unreq TMP4
+    .unreq TMP5
+    .unreq TMP6
+    .unreq TMP7
+    .unreq TMP8
+    .unreq TMPDUP
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_islow_neon
+ *
+ * This function contains a slow-but-accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform).  The following code is based
+ * directly on the IJG's original jfdctint.c; see jfdctint.c for more
+ * details.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ * rid of a bunch of VLD1.16 instructions
+ */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS+PASS1_BITS)
+
+#if CONST_BITS == 13
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+#else
+#define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
+#define F_0_298 DESCALE( 320652955, 30-CONST_BITS) /* FIX(0.298631336) */
+#define F_0_390 DESCALE( 418953276, 30-CONST_BITS) /* FIX(0.390180644) */
+#define F_0_541 DESCALE( 581104887, 30-CONST_BITS) /* FIX(0.541196100) */
+#define F_0_765 DESCALE( 821806413, 30-CONST_BITS) /* FIX(0.765366865) */
+#define F_0_899 DESCALE( 966342111, 30-CONST_BITS) /* FIX(0.899976223) */
+#define F_1_175 DESCALE(1262586813, 30-CONST_BITS) /* FIX(1.175875602) */
+#define F_1_501 DESCALE(1612031267, 30-CONST_BITS) /* FIX(1.501321110) */
+#define F_1_847 DESCALE(1984016188, 30-CONST_BITS) /* FIX(1.847759065) */
+#define F_1_961 DESCALE(2106220350, 30-CONST_BITS) /* FIX(1.961570560) */
+#define F_2_053 DESCALE(2204520673, 30-CONST_BITS) /* FIX(2.053119869) */
+#define F_2_562 DESCALE(2751909506, 30-CONST_BITS) /* FIX(2.562915447) */
+#define F_3_072 DESCALE(3299298341, 30-CONST_BITS) /* FIX(3.072711026) */
+#endif
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+    .short F_0_298
+    .short -F_0_390
+    .short F_0_541
+    .short F_0_765
+    .short -F_0_899
+    .short F_1_175
+    .short F_1_501
+    .short -F_1_847
+    .short -F_1_961
+    .short F_2_053
+    .short -F_2_562
+    .short F_3_072
+    .short 0 /* padding */
+    .short 0
+    .short 0
+    .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
+
+asm_function jsimd_fdct_islow_neon
+
+    DATA .req x0
+    TMP .req x9
+
+    /* Load constants */
+    adr TMP,
Ljsimd_fdct_islow_neon_consts + ld1 {v0.8h, v1.8h}, [TMP] + + /* Save NEON registers */ + sub sp, sp, #64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 + + /* Load all DATA into NEON registers with the following allocation: + * 0 1 2 3 | 4 5 6 7 + * ---------+-------- + * 0 | d16 | d17 | v16.8h + * 1 | d18 | d19 | v17.8h + * 2 | d20 | d21 | v18.8h + * 3 | d22 | d23 | v19.8h + * 4 | d24 | d25 | v20.8h + * 5 | d26 | d27 | v21.8h + * 6 | d28 | d29 | v22.8h + * 7 | d30 | d31 | v23.8h + */ + + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + sub DATA, DATA, #64 + + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 + /* 1-D FDCT */ + add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ + sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ + add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ + sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ + add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ + sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ + add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ + sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ + + /* even part */ + + add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ + sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ + add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ + sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ + + add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ + sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ + + add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ + + shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ + shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ + + smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + mov v22.16b, v18.16b + mov v25.16b, v24.16b + + smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + + rshrn v18.4h, v18.4s, #DESCALE_P1 + rshrn v22.4h, v22.4s, #DESCALE_P1 + rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), + CONST_BITS-PASS1_BITS); */ + rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), + CONST_BITS-PASS1_BITS); */ + + /* Odd part */ + + add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ + add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ + add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ + add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ + smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ + smull2 v5.4s, v10.8h, XFIX_P_1_175 + smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ + smlal2 v5.4s, v11.8h, XFIX_P_1_175 + + smull2 v24.4s, v28.8h, XFIX_P_0_298 + smull2 v25.4s, v29.8h, XFIX_P_2_053 + smull2 v26.4s, v30.8h, XFIX_P_3_072 + smull2 v27.4s, v31.8h, XFIX_P_1_501 + smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ + smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = 
MULTIPLY(tmp5, FIX_2_053119869); */ + smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ + smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ + + smull2 v12.4s, v8.8h, XFIX_N_0_899 + smull2 v13.4s, v9.8h, XFIX_N_2_562 + smull2 v14.4s, v10.8h, XFIX_N_1_961 + smull2 v15.4s, v11.8h, XFIX_N_0_390 + smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ + smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ + smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ + smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */ + + add v10.4s, v10.4s, v4.4s /* z3 += z5 */ + add v14.4s, v14.4s, v5.4s + add v11.4s, v11.4s, v4.4s /* z4 += z5 */ + add v15.4s, v15.4s, v5.4s + + add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ + add v24.4s, v24.4s, v12.4s + add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ + add v25.4s, v25.4s, v13.4s + add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ + add v26.4s, v26.4s, v14.4s + add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ + add v27.4s, v27.4s, v15.4s + + add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ + add v24.4s, v24.4s, v14.4s + add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ + add v25.4s, v25.4s, v15.4s + add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ + add v26.4s, v26.4s, v13.4s + add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ + add v27.4s, v27.4s, v12.4s + + rshrn v23.4h, v28.4s, #DESCALE_P1 + rshrn v21.4h, v29.4s, #DESCALE_P1 + rshrn v19.4h, v30.4s, #DESCALE_P1 + rshrn v17.4h, v31.4s, #DESCALE_P1 + rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ + rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ + + /* Transpose */ + transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 + + /* 1-D FDCT */ + add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ + sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ + add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ + sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ + add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ + sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ + add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ + sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ + + /* even part */ + + add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ + sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ + add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ + sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ + + add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ + sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ + + add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ + + srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */ + srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */ + + smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ + mov v22.16b, v18.16b + mov v25.16b, v24.16b + + smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + 
MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ + smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ + + rshrn v18.4h, v18.4s, #DESCALE_P2 + rshrn v22.4h, v22.4s, #DESCALE_P2 + rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), + CONST_BITS-PASS1_BITS); */ + rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), + CONST_BITS-PASS1_BITS); */ + + /* Odd part */ + + add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ + add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ + add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ + add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ + + smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ + smull2 v5.4s, v10.8h, XFIX_P_1_175 + smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ + smlal2 v5.4s, v11.8h, XFIX_P_1_175 + + smull2 v24.4s, v28.8h, XFIX_P_0_298 + smull2 v25.4s, v29.8h, XFIX_P_2_053 + smull2 v26.4s, v30.8h, XFIX_P_3_072 + smull2 v27.4s, v31.8h, XFIX_P_1_501 + smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ + smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ + smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ + smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ + + smull2 v12.4s, v8.8h, XFIX_N_0_899 + smull2 v13.4s, v9.8h, XFIX_N_2_562 + smull2 v14.4s, v10.8h, XFIX_N_1_961 + smull2 v15.4s, v11.8h, XFIX_N_0_390 + smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ + smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ + smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ + smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */ + + add v10.4s, v10.4s, v4.4s + add v14.4s, v14.4s, v5.4s + add v11.4s, v11.4s, v4.4s + add v15.4s, v15.4s, v5.4s + + add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ + add v24.4s, v24.4s, v12.4s + add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ + add v25.4s, v25.4s, v13.4s + add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ + add v26.4s, v26.4s, v14.4s + add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ + add v27.4s, v27.4s, v15.4s + + add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ + add v24.4s, v24.4s, v14.4s + add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ + add v25.4s, v25.4s, v15.4s + add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ + add v26.4s, v26.4s, v13.4s + add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ + add v27.4s, v27.4s, v12.4s + + rshrn v23.4h, v28.4s, #DESCALE_P2 + rshrn v21.4h, v29.4s, #DESCALE_P2 + rshrn v19.4h, v30.4s, #DESCALE_P2 + rshrn v17.4h, v31.4s, #DESCALE_P2 + rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ + rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ + rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ + + /* store results */ + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] + + /* Restore NEON registers */ + sub sp, sp, #64 + ld1 {v8.8b, v9.8b, 
v10.8b, v11.8b}, [sp], 32
+    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+    br x30
+
+    .unreq DATA
+    .unreq TMP
+
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the forward DCT (Discrete Cosine Transform).  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * 'jpeg_fdct_ifast' function from jfdctfst.c.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ * rid of a bunch of VLD1.16 instructions
+ */
+
+#undef XFIX_0_541196100
+#define XFIX_0_382683433 v0.h[0]
+#define XFIX_0_541196100 v0.h[1]
+#define XFIX_0_707106781 v0.h[2]
+#define XFIX_1_306562965 v0.h[3]
+
+.balign 16
+Ljsimd_fdct_ifast_neon_consts:
+    .short (98 * 128)              /* XFIX_0_382683433 */
+    .short (139 * 128)             /* XFIX_0_541196100 */
+    .short (181 * 128)             /* XFIX_0_707106781 */
+    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
+
+asm_function jsimd_fdct_ifast_neon
+
+    DATA .req x0
+    TMP .req x9
+
+    /* Load constants */
+    adr TMP, Ljsimd_fdct_ifast_neon_consts
+    ld1 {v0.4h}, [TMP]
+
+    /* Load all DATA into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     | v16.8h
+     *   1 | d18     | d19     | v17.8h
+     *   2 | d20     | d21     | v18.8h
+     *   3 | d22     | d23     | v19.8h
+     *   4 | d24     | d25     | v20.8h
+     *   5 | d26     | d27     | v21.8h
+     *   6 | d28     | d29     | v22.8h
+     *   7 | d30     | d31     | v23.8h
+     */
+
+    ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    mov TMP, #2
+    sub DATA, DATA, #64
+1:
+    /* Transpose */
+    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
+    subs TMP, TMP, #1
+    /* 1-D FDCT */
+    add v4.8h, v19.8h, v20.8h
+    sub v20.8h, v19.8h, v20.8h
+    sub v28.8h, v18.8h, v21.8h
+    add v18.8h, v18.8h, v21.8h
+    sub v29.8h, v17.8h, v22.8h
+    add v17.8h, v17.8h, v22.8h
+    sub v21.8h, v16.8h, v23.8h
+    add v16.8h, v16.8h, v23.8h
+    sub v6.8h, v17.8h, v18.8h
+    sub v7.8h, v16.8h, v4.8h
+    add v5.8h, v17.8h, v18.8h
+    add v6.8h, v6.8h, v7.8h
+    add v4.8h, v16.8h, v4.8h
+    sqdmulh v6.8h, v6.8h, XFIX_0_707106781
+    add v19.8h, v20.8h, v28.8h
+    add v16.8h, v4.8h, v5.8h
+    sub v20.8h, v4.8h, v5.8h
+    add v5.8h, v28.8h, v29.8h
+    add v29.8h, v29.8h, v21.8h
+    sqdmulh v5.8h, v5.8h, XFIX_0_707106781
+    sub v28.8h, v19.8h, v29.8h
+    add v18.8h, v7.8h, v6.8h
+    sqdmulh v28.8h, v28.8h, XFIX_0_382683433
+    sub v22.8h, v7.8h, v6.8h
+    sqdmulh v19.8h, v19.8h, XFIX_0_541196100
+    sqdmulh v7.8h, v29.8h, XFIX_1_306562965
+    add v6.8h, v21.8h, v5.8h
+    sub v5.8h, v21.8h, v5.8h
+    add v29.8h, v29.8h, v28.8h
+    add v19.8h, v19.8h, v28.8h
+    add v29.8h, v29.8h, v7.8h
+    add v21.8h, v5.8h, v19.8h
+    sub v19.8h, v5.8h, v19.8h
+    add v17.8h, v6.8h, v29.8h
+    sub v23.8h, v6.8h, v29.8h
+
+    b.ne 1b
+
+    /* store results */
+    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+    br x30
+
+    .unreq DATA
+    .unreq TMP
+#undef XFIX_0_382683433
+#undef XFIX_0_541196100
+#undef XFIX_0_707106781
+#undef XFIX_1_306562965
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
+ *                      DCTELEM * workspace);
+ *
+ */
+asm_function
jsimd_quantize_neon + + COEF_BLOCK .req x0 + DIVISORS .req x1 + WORKSPACE .req x2 + + RECIPROCAL .req DIVISORS + CORRECTION .req x9 + SHIFT .req x10 + LOOP_COUNT .req x11 + + mov LOOP_COUNT, #2 + add CORRECTION, DIVISORS, #(64 * 2) + add SHIFT, DIVISORS, #(64 * 6) +1: + subs LOOP_COUNT, LOOP_COUNT, #1 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64 + abs v20.8h, v0.8h + abs v21.8h, v1.8h + abs v22.8h, v2.8h + abs v23.8h, v3.8h + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64 + add v20.8h, v20.8h, v4.8h /* add correction */ + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v6.8h + add v23.8h, v23.8h, v7.8h + umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */ + umull2 v16.4s, v20.8h, v28.8h + umull v5.4s, v21.4h, v29.4h + umull2 v17.4s, v21.8h, v29.8h + umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */ + umull2 v18.4s, v22.8h, v30.8h + umull v7.4s, v23.4h, v31.4h + umull2 v19.4s, v23.8h, v31.8h + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64 + shrn v4.4h, v4.4s, #16 + shrn v5.4h, v5.4s, #16 + shrn v6.4h, v6.4s, #16 + shrn v7.4h, v7.4s, #16 + shrn2 v4.8h, v16.4s, #16 + shrn2 v5.8h, v17.4s, #16 + shrn2 v6.8h, v18.4s, #16 + shrn2 v7.8h, v19.4s, #16 + neg v24.8h, v24.8h + neg v25.8h, v25.8h + neg v26.8h, v26.8h + neg v27.8h, v27.8h + sshr v0.8h, v0.8h, #15 /* extract sign */ + sshr v1.8h, v1.8h, #15 + sshr v2.8h, v2.8h, #15 + sshr v3.8h, v3.8h, #15 + ushl v4.8h, v4.8h, v24.8h /* shift */ + ushl v5.8h, v5.8h, v25.8h + ushl v6.8h, v6.8h, v26.8h + ushl v7.8h, v7.8h, v27.8h + + eor v4.16b, v4.16b, v0.16b /* restore sign */ + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + sub v4.8h, v4.8h, v0.8h + sub v5.8h, v5.8h, v1.8h + sub v6.8h, v6.8h, v2.8h + sub v7.8h, v7.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64 + + b.ne 1b + + br x30 /* return */ + + .unreq COEF_BLOCK + .unreq DIVISORS + .unreq WORKSPACE + .unreq RECIPROCAL + .unreq CORRECTION + .unreq SHIFT + .unreq LOOP_COUNT + +/*****************************************************************************/ + +/* + * Downsample pixel values of a single component. + * This version handles the common case of 2:1 horizontal and 1:1 vertical, + * without smoothing. 
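+ *
+ * A scalar sketch of the arithmetic (in[]/out[] are illustrative names for
+ * one input row and its downsampled output, not identifiers from the
+ * sources):
+ *
+ *     out[i] = (in[2*i] + in[2*i+1] + (i & 1)) >> 1;
+ *
+ * The alternating 0,1 bias matches jcsample.c; the NEON code below obtains
+ * it by duplicating 0x10000 into each 32-bit lane of v16, which the .8h
+ * view reads as halfwords 0, 1, 0, 1, ...  The h2v2 variant further down
+ * plays the same trick with 0x00020001 to get its 1, 2 bias and a final
+ * shift by 2.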
+ * + * GLOBAL(void) + * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, + * JDIMENSION v_samp_factor, JDIMENSION width_blocks, + * JSAMPARRAY input_data, JSAMPARRAY output_data); + */ + +.balign 16 +Ljsimd_h2v1_downsample_neon_consts: + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E /* diff 0, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F /* diff 0, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E /* diff 1, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0E /* diff 1, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0D /* diff 2, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0D /* diff 2, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0C /* diff 3, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0C, 0x0C /* diff 3, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0B, 0x0B /* diff 4, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0B, 0x0B /* diff 4, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0A, 0x0A /* diff 5, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0A, 0x0A, 0x0A /* diff 5, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x09, 0x09, 0x09 /* diff 6, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x09, 0x09, 0x09 /* diff 6, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x08, 0x08, 0x08 /* diff 7, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x08, 0x08, 0x08, 0x08 /* diff 7, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x07, 0x07, 0x07, 0x07 /* diff 8, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9, even */ + .byte 0x01, 0x03, 0x05, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9, odd */ + .byte 0x00, 0x02, 0x04, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10, even */ + .byte 0x01, 0x03, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10, odd */ + .byte 0x00, 0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11, even */ + .byte 0x01, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11, odd */ + .byte 0x00, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12, even */ + .byte 0x01, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12, odd */ + .byte 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13, even */ + .byte 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13, odd */ + .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14, even */ + .byte 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14, odd */ + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15, even */ + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15, odd */ + +asm_function jsimd_h2v1_downsample_neon + IMAGE_WIDTH .req x0 + MAX_V_SAMP .req x1 + V_SAMP .req x2 + BLOCK_WIDTH .req x3 + INPUT_DATA .req x4 + OUTPUT_DATA .req x5 + OUTPTR .req x9 + INPTR .req x10 + TMP1 .req x11 + TMP2 .req x12 + TMP3 .req x13 + TMPDUP .req w15 + + mov TMPDUP, #0x10000 + lsl TMP2, BLOCK_WIDTH, #4 + sub TMP2, TMP2, IMAGE_WIDTH + adr TMP3, Ljsimd_h2v1_downsample_neon_consts + add TMP3, TMP3, TMP2, lsl #4 + dup v16.4s, TMPDUP + ld1 {v18.8b, v19.8b}, [TMP3] + +1: /* row loop */ + ldr INPTR, [INPUT_DATA], #8 + ldr OUTPTR, [OUTPUT_DATA], #8 + subs TMP1, BLOCK_WIDTH, #1 + b.eq 3f +2: /* columns */ + ld2 {v0.8b, v1.8b}, [INPTR], #16 + subs TMP1, TMP1, #1 + uaddl v2.8h, v0.8b, v1.8b + add v2.8h, v2.8h, v16.8h + shrn v2.8b, v2.8h, #1 + st1 {v2.8b}, [OUTPTR], #8 + b.ne 2b +3: /* last columns */ + ld1 {v0.16b}, [INPTR] + subs V_SAMP, V_SAMP, #1 + /* expand right */ + tbl v2.8b, {v0.16b}, v18.8b + tbl 
v3.8b, {v0.16b}, v19.8b + uaddl v2.8h, v2.8b, v3.8b + add v2.8h, v2.8h, v16.8h + shrn v2.8b, v2.8h, #1 + st1 {v2.8b}, [OUTPTR], #8 + b.ne 1b + + br x30 + + .unreq IMAGE_WIDTH + .unreq MAX_V_SAMP + .unreq V_SAMP + .unreq BLOCK_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA + .unreq OUTPTR + .unreq INPTR + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMPDUP + +/*****************************************************************************/ + +/* + * Downsample pixel values of a single component. + * This version handles the common case of 2:1 horizontal and 2:1 vertical, + * without smoothing. + * + * GLOBAL(void) + * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, + * JDIMENSION v_samp_factor, JDIMENSION width_blocks, + * JSAMPARRAY input_data, JSAMPARRAY output_data); + */ + +.balign 16 +Ljsimd_h2v2_downsample_neon_consts: + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E /* diff 0, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F /* diff 0, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E /* diff 1, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0E /* diff 1, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0D /* diff 2, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0D /* diff 2, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0C /* diff 3, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0C, 0x0C /* diff 3, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0B, 0x0B /* diff 4, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0B, 0x0B /* diff 4, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0A, 0x0A /* diff 5, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0A, 0x0A, 0x0A /* diff 5, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x09, 0x09, 0x09 /* diff 6, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x09, 0x09, 0x09 /* diff 6, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x08, 0x08, 0x08 /* diff 7, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x08, 0x08, 0x08, 0x08 /* diff 7, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x07, 0x07, 0x07, 0x07 /* diff 8, even */ + .byte 0x01, 0x03, 0x05, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8, odd */ + .byte 0x00, 0x02, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9, even */ + .byte 0x01, 0x03, 0x05, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9, odd */ + .byte 0x00, 0x02, 0x04, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10, even */ + .byte 0x01, 0x03, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10, odd */ + .byte 0x00, 0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11, even */ + .byte 0x01, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11, odd */ + .byte 0x00, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12, even */ + .byte 0x01, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12, odd */ + .byte 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13, even */ + .byte 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13, odd */ + .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14, even */ + .byte 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14, odd */ + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15, even */ + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15, odd */ + +asm_function jsimd_h2v2_downsample_neon + IMAGE_WIDTH .req x0 + MAX_V_SAMP .req x1 + V_SAMP .req x2 + BLOCK_WIDTH .req x3 + INPUT_DATA .req x4 + OUTPUT_DATA .req x5 + OUTPTR .req x9 + INPTR0 .req x10 + INPTR1 .req x14 + TMP1 .req x11 + TMP2 .req x12 + TMP3 .req x13 + TMPDUP .req w15 + + mov TMPDUP, #1 + lsl TMP2, 
BLOCK_WIDTH, #4 + lsl TMPDUP, TMPDUP, #17 + sub TMP2, TMP2, IMAGE_WIDTH + adr TMP3, Ljsimd_h2v2_downsample_neon_consts + orr TMPDUP, TMPDUP, #1 + add TMP3, TMP3, TMP2, lsl #4 + dup v16.4s, TMPDUP + ld1 {v18.8b, v19.8b}, [TMP3] + +1: /* row loop */ + ldr INPTR0, [INPUT_DATA], #8 + ldr OUTPTR, [OUTPUT_DATA], #8 + ldr INPTR1, [INPUT_DATA], #8 + subs TMP1, BLOCK_WIDTH, #1 + b.eq 3f +2: /* columns */ + ld2 {v0.8b, v1.8b}, [INPTR0], #16 + ld2 {v2.8b, v3.8b}, [INPTR1], #16 + subs TMP1, TMP1, #1 + uaddl v4.8h, v0.8b, v1.8b + uaddl v6.8h, v2.8b, v3.8b + add v4.8h, v4.8h, v6.8h + add v4.8h, v4.8h, v16.8h + shrn v4.8b, v4.8h, #2 + st1 {v4.8b}, [OUTPTR], #8 + b.ne 2b +3: /* last columns */ + ld1 {v0.16b}, [INPTR0] + ld1 {v1.16b}, [INPTR1] + subs V_SAMP, V_SAMP, #1 + /* expand right */ + tbl v4.8b, {v0.16b}, v18.8b + tbl v5.8b, {v0.16b}, v19.8b + tbl v6.8b, {v1.16b}, v18.8b + tbl v7.8b, {v1.16b}, v19.8b + uaddl v4.8h, v4.8b, v5.8b + uaddl v6.8h, v6.8b, v7.8b + add v4.8h, v4.8h, v6.8h + add v4.8h, v4.8h, v16.8h + shrn v4.8b, v4.8h, #2 + st1 {v4.8b}, [OUTPTR], #8 + b.ne 1b + + br x30 + + .unreq IMAGE_WIDTH + .unreq MAX_V_SAMP + .unreq V_SAMP + .unreq BLOCK_WIDTH + .unreq INPUT_DATA + .unreq OUTPUT_DATA + .unreq OUTPTR + .unreq INPTR0 + .unreq INPTR1 + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 + .unreq TMPDUP -- 2.40.0