granicus.if.org Git - libjpeg-turbo/commitdiff
Complete the ARM64 NEON SIMD implementation
author DRC <information@libjpeg-turbo.org>
Fri, 15 Jan 2016 15:29:11 +0000 (09:29 -0600)
committer DRC <information@libjpeg-turbo.org>
Fri, 15 Jan 2016 17:21:48 +0000 (11:21 -0600)
This adds 64-bit NEON coverage for all of the algorithms that are
covered by the 32-bit NEON implementation, except for h2v1 (4:2:2) fancy
upsampling (used when decompressing 4:2:2 JPEG images).  It also adds
64-bit NEON SIMD coverage for:

* slow integer forward DCT (compressor)
* h2v2 (4:2:0) downsampling (compressor)
* h2v1 (4:2:2) downsampling (compressor)

which are not covered in the 32-bit implementation.

Compression speedups relative to libjpeg-turbo 1.4.2:
Apple A7 (iPhone 5S), iOS, 64-bit: 113-150% (reported)
48-core ThunderX (RunAbove ARM Cloud), Linux, 64-bit: 2.1-33% (avg. 15%)

Refer to #44 and #49 for discussion.

This commit also removes the unnecessary

    if (simd_support & JSIMD_ARM_NEON)

statements from the jsimd* algorithm functions.  Since the jsimd_can*()
functions check for the existence of NEON, the corresponding algorithm
functions will never be called if NEON isn't available.
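
For reference, the dispatch pattern looks like this (a minimal C sketch;
the exact method-pointer field used in jcdctmgr.c may differ):

    /* At initialization, probe once and bind either the NEON-accelerated
       or the portable C implementation: */
    if (jsimd_can_fdct_islow())
      fdct->dct = jsimd_fdct_islow;   /* NEON */
    else
      fdct->dct = jpeg_fdct_islow;    /* portable C */

    /* Per block, the bound pointer is invoked directly, so
       jsimd_fdct_islow() itself can never run without NEON: */
    (*fdct->dct) (workspace);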

Based on:
https://github.com/mayeut/libjpeg-turbo/commit/dcd9d84f10fae192c0e3935818dc289bca9c3e29
https://github.com/mayeut/libjpeg-turbo/commit/b0d87b811f37bd560083deea8c6e7d704e5cd944
https://github.com/mayeut/libjpeg-turbo/commit/70cd5c8a493a67f4d54dd2067ae6dedb65d95389
https://github.com/mayeut/libjpeg-turbo/commit/3e58d9a064648503c57ec2650ee79880f749a52b
https://github.com/mayeut/libjpeg-turbo/commit/837b19542f53fa81af83e6ba002d559877aaf597
https://github.com/mayeut/libjpeg-turbo/commit/73dc43ccc870c2e10ba893e9764b8e48d6836585
https://github.com/mayeut/libjpeg-turbo/commit/a82b71a261b4c0213f558baf4bc745f1c27356d8
https://github.com/mayeut/libjpeg-turbo/commit/c1b1188c2106d6ea7b76644b6023b57edeb602e1
https://github.com/mayeut/libjpeg-turbo/commit/305c89284e1bb222b34fbc7261f697a0cc452a41
https://github.com/mayeut/libjpeg-turbo/commit/7f443f99950b4d7d442b9b879648eca5273209bd
https://github.com/mayeut/libjpeg-turbo/commit/4c2b53b77da5a20e30e2aadaeddb0efbfe24e06d

Unified version with fixes:
https://github.com/mayeut/libjpeg-turbo/commit/1004a3cd05870612a194b410efeaa1b4da76d246

ChangeLog.txt
simd/jsimd.h
simd/jsimd_arm64.c
simd/jsimd_arm64_neon.S

ChangeLog.txt
index f79660e901f152f3e314197e43c93096ddc4063f..379cfbd8607dc2ee7f8c82d9bbbfc81cac3cace0 100644
@@ -71,6 +71,13 @@ setting the JSIMD_NOHUFFENC environment variable to 1.
 platforms.  This speeds up the compression of full-color JPEGs by about 30% on
 average.
 
+[14] Completed the ARM 64-bit (ARMv8) NEON SIMD implementation.  64-bit ARM
+now has SIMD coverage for all of the algorithms that are covered in the 32-bit
+(ARMv7) implementation, except for h2v1 (4:2:2) fancy upsampling.
+Additionally, the ARM 64-bit SIMD implementation now accelerates the slow
+integer forward DCT and h2v2 & h2v1 downsampling algorithms, which are not
+accelerated in the 32-bit implementation.
+
 
 1.4.2
 =====
simd/jsimd.h
index e259feab857697fd270b097781215f882fdcbf4d..1c598f0528792448a9257ce944f478788c5e3834 100644
@@ -360,6 +360,11 @@ EXTERN(void) jsimd_h2v1_downsample_sse2
          JDIMENSION v_samp_factor, JDIMENSION width_blocks,
          JSAMPARRAY input_data, JSAMPARRAY output_data);
 
+EXTERN(void) jsimd_h2v1_downsample_neon
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
 EXTERN(void) jsimd_h2v1_downsample_mips_dspr2
         (JDIMENSION image_width, int max_v_samp_factor,
          JDIMENSION v_samp_factor, JDIMENSION width_blocks,
@@ -381,6 +386,11 @@ EXTERN(void) jsimd_h2v2_downsample_sse2
          JDIMENSION v_samp_factor, JDIMENSION width_blocks,
          JSAMPARRAY input_data, JSAMPARRAY output_data);
 
+EXTERN(void) jsimd_h2v2_downsample_neon
+        (JDIMENSION image_width, int max_v_samp_factor,
+         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+         JSAMPARRAY input_data, JSAMPARRAY output_data);
+
 EXTERN(void) jsimd_h2v2_downsample_mips_dspr2
         (JDIMENSION image_width, int max_v_samp_factor,
          JDIMENSION v_samp_factor, JDIMENSION width_blocks,
@@ -680,6 +690,8 @@ EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM * data);
 extern const int jconst_fdct_islow_sse2[];
 EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM * data);
 
+EXTERN(void) jsimd_fdct_islow_neon (DCTELEM * data);
+
 EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data);
 
 EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM * data);
simd/jsimd_arm64.c
index 585feeb978352c9d61f1b249d7ef52a6f3b8b40c..2b0e6f77d5ac524fcd9e448cbf6aa34d3a7271cb 100644
@@ -2,8 +2,8 @@
  * jsimd_arm64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013-2014 D. R. Commander
- * Copyright 2015 Matthieu Darbois
+ * Copyright 2009-2011, 2013-2014, 2016 D. R. Commander
+ * Copyright 2015-2016 Matthieu Darbois
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -66,6 +66,17 @@ jsimd_can_rgb_ycc (void)
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -118,6 +129,37 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                        JDIMENSION output_row, int num_rows)
 {
+  void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch(cinfo->in_color_space) {
+    case JCS_EXT_RGB:
+      neonfct=jsimd_extrgb_ycc_convert_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct=jsimd_extrgbx_ycc_convert_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct=jsimd_extbgr_ycc_convert_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct=jsimd_extbgrx_ycc_convert_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct=jsimd_extxbgr_ycc_convert_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct=jsimd_extxrgb_ycc_convert_neon;
+      break;
+    default:
+      neonfct=jsimd_extrgb_ycc_convert_neon;
+      break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -162,8 +204,7 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
       break;
   }
 
-  if (simd_support & JSIMD_ARM_NEON)
-    neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 }
 
 GLOBAL(void)
@@ -171,9 +212,8 @@ jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
                           JSAMPIMAGE input_buf, JDIMENSION input_row,
                           JSAMPARRAY output_buf, int num_rows)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
-                                  output_buf, num_rows);
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
 }
 
 GLOBAL(int)
@@ -181,6 +221,17 @@ jsimd_can_h2v2_downsample (void)
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -189,6 +240,17 @@ jsimd_can_h2v1_downsample (void)
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -196,12 +258,18 @@ GLOBAL(void)
 jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
 }
 
 GLOBAL(void)
 jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
                        JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
 }
 
 GLOBAL(int)
@@ -305,6 +373,19 @@ jsimd_can_convsamp (void)
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -320,6 +401,7 @@ GLOBAL(void)
 jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
                 DCTELEM * workspace)
 {
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
 }
 
 GLOBAL(void)
@@ -333,6 +415,15 @@ jsimd_can_fdct_islow (void)
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -341,6 +432,15 @@ jsimd_can_fdct_ifast (void)
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -355,11 +455,13 @@ jsimd_can_fdct_float (void)
 GLOBAL(void)
 jsimd_fdct_islow (DCTELEM * data)
 {
+  jsimd_fdct_islow_neon(data);
 }
 
 GLOBAL(void)
 jsimd_fdct_ifast (DCTELEM * data)
 {
+  jsimd_fdct_ifast_neon(data);
 }
 
 GLOBAL(void)
@@ -372,6 +474,17 @@ jsimd_can_quantize (void)
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -387,6 +500,7 @@ GLOBAL(void)
 jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
                 DCTELEM * workspace)
 {
+  jsimd_quantize_neon(coef_block, divisors, workspace);
 }
 
 GLOBAL(void)
@@ -446,9 +560,8 @@ jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(void)
@@ -456,9 +569,8 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
                 JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
+                      output_col);
 }
 
 GLOBAL(int)
@@ -522,9 +634,8 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
-                          output_col);
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
@@ -532,9 +643,8 @@ jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                   JCOEFPTR coef_block, JSAMPARRAY output_buf,
                   JDIMENSION output_col)
 {
-  if (simd_support & JSIMD_ARM_NEON)
-    jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
-                          output_col);
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
 }
 
 GLOBAL(void)
simd/jsimd_arm64_neon.S
index 89fc6603131aa4cff2b9319cd76be38125c6daa6..040386d5ba730975eb1d946ad7b86effaededa4b 100644
@@ -7,6 +7,7 @@
  * Copyright (C) 2013-2014, Linaro Limited
  * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
  * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -91,6 +92,35 @@ _\fname:
     transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
 .endm
 
+.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
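+    /* Transpose an 8x8 block of 16-bit elements held in l0-l7, using
+     * t0-t3 as scratch: three passes of trn1/trn2 at doubling element
+     * widths (.8h, then .4s, then .2d) realize the full transpose. */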
+    trn1 \t0\().8h, \l0\().8h, \l1\().8h
+    trn1 \t1\().8h, \l2\().8h, \l3\().8h
+    trn1 \t2\().8h, \l4\().8h, \l5\().8h
+    trn1 \t3\().8h, \l6\().8h, \l7\().8h
+    trn2 \l1\().8h, \l0\().8h, \l1\().8h
+    trn2 \l3\().8h, \l2\().8h, \l3\().8h
+    trn2 \l5\().8h, \l4\().8h, \l5\().8h
+    trn2 \l7\().8h, \l6\().8h, \l7\().8h
+
+    trn1 \l4\().4s, \t2\().4s, \t3\().4s
+    trn2 \t3\().4s, \t2\().4s, \t3\().4s
+    trn1 \t2\().4s, \t0\().4s, \t1\().4s
+    trn2 \l2\().4s, \t0\().4s, \t1\().4s
+    trn1 \t0\().4s, \l1\().4s, \l3\().4s
+    trn2 \l3\().4s, \l1\().4s, \l3\().4s
+    trn2 \t1\().4s, \l5\().4s, \l7\().4s
+    trn1 \l5\().4s, \l5\().4s, \l7\().4s
+
+    trn2 \l6\().2d, \l2\().2d, \t3\().2d
+    trn1 \l0\().2d, \t2\().2d, \l4\().2d
+    trn1 \l1\().2d, \t0\().2d, \l5\().2d
+    trn2 \l7\().2d, \l3\().2d, \t1\().2d
+    trn1 \l2\().2d, \l2\().2d, \t3\().2d
+    trn2 \l4\().2d, \t2\().2d, \l4\().2d
+    trn1 \l3\().2d, \l3\().2d, \t1\().2d
+    trn2 \l5\().2d, \t0\().2d, \l5\().2d
+.endm
+
 
 #define CENTERJSAMPLE 128
 
@@ -1055,6 +1085,7 @@ asm_function jsimd_idct_ifast_neon
     .unreq          TMP2
     .unreq          TMP3
     .unreq          TMP4
+    .unreq          TMP5
 
 
 /*****************************************************************************/
@@ -1859,3 +1890,1183 @@ generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h,   .
 generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,   0, .4h,   0, .4h,   .8b
 .purgem do_load
 .purgem do_store
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
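+    /* Store \size pixels of each of the Y, U and V rows.  The 8-pixel
+     * case uses full vector stores; the 4-, 2- and 1-pixel cases store
+     * individual lanes and compose to finish a row whose width is not a
+     * multiple of 8. */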
+    .if \size == 8
+        st1  {v20.8b}, [Y], #8
+        st1  {v21.8b}, [U], #8
+        st1  {v22.8b}, [V], #8
+    .elseif \size == 4
+        st1  {v20.b}[0], [Y], #1
+        st1  {v20.b}[1], [Y], #1
+        st1  {v20.b}[2], [Y], #1
+        st1  {v20.b}[3], [Y], #1
+        st1  {v21.b}[0], [U], #1
+        st1  {v21.b}[1], [U], #1
+        st1  {v21.b}[2], [U], #1
+        st1  {v21.b}[3], [U], #1
+        st1  {v22.b}[0], [V], #1
+        st1  {v22.b}[1], [V], #1
+        st1  {v22.b}[2], [V], #1
+        st1  {v22.b}[3], [V], #1
+    .elseif \size == 2
+        st1  {v20.b}[4], [Y], #1
+        st1  {v20.b}[5], [Y], #1
+        st1  {v21.b}[4], [U], #1
+        st1  {v21.b}[5], [U], #1
+        st1  {v22.b}[4], [V], #1
+        st1  {v22.b}[5], [V], #1
+    .elseif \size == 1
+        st1  {v20.b}[6], [Y], #1
+        st1  {v21.b}[6], [U], #1
+        st1  {v22.b}[6], [V], #1
+    .else
+        .error unsupported macroblock size
+    .endif
+.endm
+
+.macro do_load bpp, size
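+    /* Load \size pixels, de-interleaving the colour channels into
+     * v10-v13: ld3 for 24-bit pixels, ld4 for 32-bit pixels, single-lane
+     * loads for the 4-, 2- and 1-pixel tails, plus a prefetch of upcoming
+     * pixels in the full 8-pixel case. */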
+    .if \bpp == 24
+        .if \size == 8
+            ld3  {v10.8b, v11.8b, v12.8b}, [RGB], #24
+            prfm pldl1keep, [RGB, #128]
+        .elseif \size == 4
+            ld3  {v10.b, v11.b, v12.b}[0], [RGB], #3
+            ld3  {v10.b, v11.b, v12.b}[1], [RGB], #3
+            ld3  {v10.b, v11.b, v12.b}[2], [RGB], #3
+            ld3  {v10.b, v11.b, v12.b}[3], [RGB], #3
+        .elseif \size == 2
+            ld3  {v10.b, v11.b, v12.b}[4], [RGB], #3
+            ld3  {v10.b, v11.b, v12.b}[5], [RGB], #3
+        .elseif \size == 1
+            ld3  {v10.b, v11.b, v12.b}[6], [RGB], #3
+        .else
+            .error unsupported macroblock size
+        .endif
+    .elseif \bpp == 32
+        .if \size == 8
+            ld4  {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
+            prfm pldl1keep, [RGB, #128]
+        .elseif \size == 4
+            ld4  {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
+            ld4  {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
+            ld4  {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
+            ld4  {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
+        .elseif \size == 2
+            ld4  {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
+            ld4  {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
+        .elseif \size == 1
+            ld4  {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
+        .else
+            .error unsupported macroblock size
+        .endif
+    .else
+        .error unsupported bpp
+    .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
+.macro do_rgb_to_yuv_stage1
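+    /* Widen the 8-bit r/g/b channels to 16 bits (ushll #0), then build
+     * the three fixed-point dot products in 32-bit accumulators with
+     * umull/umlal/umlsl.  The Cb/Cr accumulators are seeded from v1,
+     * whose 32-bit lanes all hold (128 << 16) + 32767 -- the +128 chroma
+     * offset plus a truncation bias (rev64 on identical lanes amounts to
+     * a register copy). */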
+    ushll       v4.8h, v1\r_offs\().8b, #0  /* r = { v4.8h } */
+    ushll       v6.8h, v1\g_offs\().8b, #0  /* g = { v6.8h } */
+    ushll       v8.8h, v1\b_offs\().8b, #0  /* b = { v8.8h } */
+    ins         v5.d[0], v4.d[1]
+    ins         v7.d[0], v6.d[1]
+    ins         v9.d[0], v8.d[1]
+    rev64       v18.4s, v1.4s
+    rev64       v26.4s, v1.4s
+    rev64       v28.4s, v1.4s
+    rev64       v30.4s, v1.4s
+    umull       v14.4s, v4.4h, v0.h[0]
+    umull       v16.4s, v5.4h, v0.h[0]
+    umlsl       v18.4s, v4.4h, v0.h[3]
+    umlsl       v26.4s, v5.4h, v0.h[3]
+    umlal       v28.4s, v4.4h, v0.h[5]
+    umlal       v30.4s, v5.4h, v0.h[5]
+    umlal       v14.4s, v6.4h, v0.h[1]
+    umlal       v16.4s, v7.4h, v0.h[1]
+    umlsl       v18.4s, v6.4h, v0.h[4]
+    umlsl       v26.4s, v7.4h, v0.h[4]
+    umlsl       v28.4s, v6.4h, v0.h[6]
+    umlsl       v30.4s, v7.4h, v0.h[6]
+    umlal       v14.4s, v8.4h, v0.h[2]
+    umlal       v16.4s, v9.4h, v0.h[2]
+    umlal       v18.4s, v8.4h, v0.h[5]
+    umlal       v26.4s, v9.4h, v0.h[5]
+    umlsl       v28.4s, v8.4h, v0.h[7]
+    umlsl       v30.4s, v9.4h, v0.h[7]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+    rshrn       v20.4h, v14.4s, #16
+    rshrn       v21.4h, v16.4s, #16
+    shrn        v22.4h, v18.4s, #16
+    shrn        v23.4h, v26.4s, #16
+    shrn        v24.4h, v28.4s, #16
+    shrn        v25.4h, v30.4s, #16
+    ins         v20.d[1], v21.d[0]
+    ins         v22.d[1], v23.d[0]
+    ins         v24.d[1], v25.d[0]
+    xtn         v20.8b, v20.8h      /* v20 = y */
+    xtn         v21.8b, v22.8h      /* v21 = u */
+    xtn         v22.8b, v24.8h      /* v22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+.macro do_rgb_to_yuv_stage2_store_load_stage1
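+    /* Software-pipelined fusion: the extra-indented instructions finish
+     * stage 2 and store iteration i while the flush-left ones load and
+     * start stage 1 of iteration i+1, hiding load and multiply latency. */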
+      rshrn       v20.4h, v14.4s, #16
+      rshrn       v21.4h, v16.4s, #16
+      shrn        v22.4h, v18.4s, #16
+    rev64       v18.4s, v1.4s
+      shrn        v23.4h, v26.4s, #16
+      ins         v20.d[1], v21.d[0]
+    rev64       v26.4s, v1.4s
+      shrn        v24.4h, v28.4s, #16
+      shrn        v25.4h, v30.4s, #16
+      ins         v22.d[1], v23.d[0]
+    do_load     \bpp, 8
+      xtn         v20.8b, v20.8h            /* v20 = y */
+      ins         v24.d[1], v25.d[0]
+    ushll       v4.8h, v1\r_offs\().8b, #0  /* r = { v4.8h } */
+      xtn         v21.8b, v22.8h            /* v21 = u */
+    ushll       v6.8h, v1\g_offs\().8b, #0  /* g = { v6.8h } */
+    ushll       v8.8h, v1\b_offs\().8b, #0  /* b = { v8.8h } */
+      xtn         v22.8b, v24.8h            /* v22 = v */
+    ins         v5.d[0], v4.d[1]
+    ins         v7.d[0], v6.d[1]
+    ins         v9.d[0], v8.d[1]
+      st1         {v20.8b}, [Y], #8
+    umull       v14.4s, v4.4h, v0.h[0]
+    umull       v16.4s, v5.4h, v0.h[0]
+    umlsl       v18.4s, v4.4h, v0.h[3]
+    umlal       v14.4s, v6.4h, v0.h[1]
+    umlal       v16.4s, v7.4h, v0.h[1]
+    umlsl       v18.4s, v6.4h, v0.h[4]
+    umlal       v14.4s, v8.4h, v0.h[2]
+    umlal       v16.4s, v9.4h, v0.h[2]
+    umlal       v18.4s, v8.4h, v0.h[5]
+    rev64       v28.4s, v1.4s
+    rev64       v30.4s, v1.4s
+      st1         {v21.8b}, [U], #8
+    umlsl       v26.4s, v5.4h, v0.h[3]
+    umlal       v28.4s, v4.4h, v0.h[5]
+    umlal       v30.4s, v5.4h, v0.h[5]
+      st1         {v22.8b}, [V], #8
+    umlsl       v26.4s, v7.4h, v0.h[4]
+    umlsl       v28.4s, v6.4h, v0.h[6]
+    umlsl       v30.4s, v7.4h, v0.h[6]
+    umlal       v26.4s, v9.4h, v0.h[5]
+    umlsl       v28.4s, v8.4h, v0.h[7]
+    umlsl       v30.4s, v9.4h, v0.h[7]
+.endm
+
+.balign 16
+Ljsimd_\colorid\()_ycc_neon_consts:
+    .short          19595, 38470, 7471,  11059
+    .short          21709, 32768, 27439, 5329
+    .short          32767, 128,   32767, 128
+    .short          32767, 128,   32767, 128
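+/* Rows 1-2 hold the JPEG RGB->YCbCr coefficients in Q16 fixed point
+ * (19595 ~ 0.29900 * 65536, 38470 ~ 0.58700 * 65536, 7471 ~ 0.11400 *
+ * 65536, ...); rows 3-4 hold the (128 << 16) + 32767 chroma offset/bias
+ * seed as pairs of shorts. */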
+
+asm_function jsimd_\colorid\()_ycc_convert_neon
+    OUTPUT_WIDTH    .req w0
+    INPUT_BUF       .req x1
+    OUTPUT_BUF      .req x2
+    OUTPUT_ROW      .req x3
+    NUM_ROWS        .req x4
+
+    OUTPUT_BUF0     .req x5
+    OUTPUT_BUF1     .req x6
+    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */
+
+    RGB             .req x7
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+    N               .req w12
+
+    /* Load constants to v0, v1 */
+    adr             x13, Ljsimd_\colorid\()_ycc_neon_consts
+    ld1             {v0.8h, v1.8h}, [x13]
+
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
+    .unreq          OUTPUT_BUF
+
+    /* Save NEON registers */
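+    /* (the AAPCS64 ABI only requires the low 64 bits of v8-v15 to be
+     * preserved, hence the .8b stores) */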
+    sub             sp, sp, #64
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    b.lt            9f
+0:
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #8
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    b.lt            3f
+    do_load         \bpp, 8
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    b.lt            2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1
+    subs            N, N, #8
+    b.ge            1b
+2:
+    do_rgb_to_yuv_stage2
+    do_store        8
+    tst             N, #7
+    b.eq            8f
+3:
+    tbz             N, #2, 3f
+    do_load         \bpp, 4
+3:
+    tbz             N, #1, 4f
+    do_load         \bpp, 2
+4:
+    tbz             N, #0, 5f
+    do_load         \bpp, 1
+5:
+    do_rgb_to_yuv
+    tbz             N, #2, 6f
+    do_store        4
+6:
+    tbz             N, #1, 7f
+    do_store        2
+7:
+    tbz             N, #0, 8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    b.gt            0b
+9:
+    /* Restore all registers and return */
+    sub             sp, sp, #64
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    br              x30
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  G  B */
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
+
+.purgem do_load
+.purgem do_store
+
+/*****************************************************************************/
+
+/*
+ * Load data into workspace, applying unsigned->signed conversion
+ *
+ * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
+ *       rid of ST1 instructions
+ */
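+/* Scalar equivalent (cf. the sample-conversion loop in jcdctmgr.c):
+ *
+ *     workspace[i] = (DCTELEM) (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *
+ * usubl widens from 8 to 16 bits and subtracts the 128 offset in one
+ * instruction. */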
+
+asm_function jsimd_convsamp_neon
+    SAMPLE_DATA     .req x0
+    START_COL       .req x1
+    WORKSPACE       .req x2
+    TMP1            .req x9
+    TMP2            .req x10
+    TMP3            .req x11
+    TMP4            .req x12
+    TMP5            .req x13
+    TMP6            .req x14
+    TMP7            .req x15
+    TMP8            .req x4
+    TMPDUP          .req w3
+
+    mov             TMPDUP, #128
+    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
+    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
+    dup             v0.8b, TMPDUP
+    add             TMP1, TMP1, START_COL
+    add             TMP2, TMP2, START_COL
+    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
+    add             TMP3, TMP3, START_COL
+    add             TMP4, TMP4, START_COL
+    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
+    add             TMP5, TMP5, START_COL
+    add             TMP6, TMP6, START_COL
+    ld1             {v16.8b}, [TMP1]
+    add             TMP7, TMP7, START_COL
+    add             TMP8, TMP8, START_COL
+    ld1             {v17.8b}, [TMP2]
+    usubl           v16.8h, v16.8b, v0.8b
+    ld1             {v18.8b}, [TMP3]
+    usubl           v17.8h, v17.8b, v0.8b
+    ld1             {v19.8b}, [TMP4]
+    usubl           v18.8h, v18.8b, v0.8b
+    ld1             {v20.8b}, [TMP5]
+    usubl           v19.8h, v19.8b, v0.8b
+    ld1             {v21.8b}, [TMP6]
+    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
+    usubl           v20.8h, v20.8b, v0.8b
+    ld1             {v22.8b}, [TMP7]
+    usubl           v21.8h, v21.8b, v0.8b
+    ld1             {v23.8b}, [TMP8]
+    usubl           v22.8h, v22.8b, v0.8b
+    usubl           v23.8h, v23.8b, v0.8b
+    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
+
+    br              x30
+
+    .unreq          SAMPLE_DATA
+    .unreq          START_COL
+    .unreq          WORKSPACE
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+    .unreq          TMP5
+    .unreq          TMP6
+    .unreq          TMP7
+    .unreq          TMP8
+    .unreq          TMPDUP
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_islow_neon
+ *
+ * This function contains a slow-but-accurate integer implementation of
+ * the forward DCT (Discrete Cosine Transform).  The following code is
+ * based directly on the IJG's original jfdctint.c; see jfdctint.c for
+ * more details.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ *       rid of a bunch of LD1 instructions
+ */
+
+#define CONST_BITS      13
+#define PASS1_BITS      2
+
+#define DESCALE_P1      (CONST_BITS-PASS1_BITS)
+#define DESCALE_P2      (CONST_BITS+PASS1_BITS)
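+
+/* As in jfdctint.c: pass 1 leaves the output scaled up by PASS1_BITS so
+ * that pass 2 can fold the descaling of both passes into single rounding
+ * shifts by CONST_BITS+PASS1_BITS. */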
+
+#if CONST_BITS == 13
+#define F_0_298      2446           /* FIX(0.298631336) */
+#define F_0_390      3196           /* FIX(0.390180644) */
+#define F_0_541      4433           /* FIX(0.541196100) */
+#define F_0_765      6270           /* FIX(0.765366865) */
+#define F_0_899      7373           /* FIX(0.899976223) */
+#define F_1_175      9633           /* FIX(1.175875602) */
+#define F_1_501     12299           /* FIX(1.501321110) */
+#define F_1_847     15137           /* FIX(1.847759065) */
+#define F_1_961     16069           /* FIX(1.961570560) */
+#define F_2_053     16819           /* FIX(2.053119869) */
+#define F_2_562     20995           /* FIX(2.562915447) */
+#define F_3_072     25172           /* FIX(3.072711026) */
+#else
+#define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+#define F_0_298     DESCALE( 320652955, 30-CONST_BITS)  /* FIX(0.298631336) */
+#define F_0_390     DESCALE( 418953276, 30-CONST_BITS)  /* FIX(0.390180644) */
+#define F_0_541     DESCALE( 581104887, 30-CONST_BITS)  /* FIX(0.541196100) */
+#define F_0_765     DESCALE( 821806413, 30-CONST_BITS)  /* FIX(0.765366865) */
+#define F_0_899     DESCALE( 966342111, 30-CONST_BITS)  /* FIX(0.899976223) */
+#define F_1_175     DESCALE(1262586813, 30-CONST_BITS)  /* FIX(1.175875602) */
+#define F_1_501     DESCALE(1612031267, 30-CONST_BITS)  /* FIX(1.501321110) */
+#define F_1_847     DESCALE(1984016188, 30-CONST_BITS)  /* FIX(1.847759065) */
+#define F_1_961     DESCALE(2106220350, 30-CONST_BITS)  /* FIX(1.961570560) */
+#define F_2_053     DESCALE(2204520673, 30-CONST_BITS)  /* FIX(2.053119869) */
+#define F_2_562     DESCALE(2751909506, 30-CONST_BITS)  /* FIX(2.562915447) */
+#define F_3_072     DESCALE(3299298341, 30-CONST_BITS)  /* FIX(3.072711026) */
+#endif
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+    .short F_0_298
+    .short -F_0_390
+    .short F_0_541
+    .short F_0_765
+    .short -F_0_899
+    .short F_1_175
+    .short F_1_501
+    .short -F_1_847
+    .short -F_1_961
+    .short F_2_053
+    .short -F_2_562
+    .short F_3_072
+    .short 0  /* padding */
+    .short 0
+    .short 0
+    .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
+
+asm_function jsimd_fdct_islow_neon
+
+    DATA    .req x0
+    TMP     .req x9
+
+    /* Load constants */
+    adr     TMP, Ljsimd_fdct_islow_neon_consts
+    ld1     {v0.8h, v1.8h}, [TMP]
+
+    /* Save NEON registers */
+    sub     sp, sp, #64
+    st1     {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    st1     {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+    /* Load all DATA into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 |      v16.8h      |
+     *   1 |      v17.8h      |
+     *   2 |      v18.8h      |
+     *   3 |      v19.8h      |
+     *   4 |      v20.8h      |
+     *   5 |      v21.8h      |
+     *   6 |      v22.8h      |
+     *   7 |      v23.8h      |
+     */
+
+    ld1     {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1     {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    sub     DATA, DATA, #64
+
+    /* Transpose */
+    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+    /* 1-D FDCT */
+    add v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+    /* even part */
+
+    add v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+
+    add v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+
+    add v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+
+    shl v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+    shl v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+
+    smull2 v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull  v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov    v22.16b, v18.16b
+    mov    v25.16b, v24.16b
+
+    smlal  v18.4s,  v9.4h, XFIX_P_0_765  /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2 v24.4s,  v9.8h, XFIX_P_0_765  /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal  v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2 v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+    rshrn  v18.4h, v18.4s, #DESCALE_P1
+    rshrn  v22.4h, v22.4s, #DESCALE_P1
+    rshrn2 v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765),
+                                                                          CONST_BITS-PASS1_BITS); */
+    rshrn2 v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847),
+                                                                          CONST_BITS-PASS1_BITS); */
+
+    /* Odd part */
+
+    add  v8.8h, v28.8h, v31.8h  /* z1 = tmp4 + tmp7; */
+    add  v9.8h, v29.8h, v30.8h  /* z2 = tmp5 + tmp6; */
+    add v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
+    add v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
+    smull  v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2 v5.4s, v10.8h, XFIX_P_1_175
+    smlal  v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2 v5.4s, v11.8h, XFIX_P_1_175
+
+    smull2 v24.4s, v28.8h, XFIX_P_0_298
+    smull2 v25.4s, v29.8h, XFIX_P_2_053
+    smull2 v26.4s, v30.8h, XFIX_P_3_072
+    smull2 v27.4s, v31.8h, XFIX_P_1_501
+    smull  v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull  v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull  v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull  v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+    smull2 v12.4s,  v8.8h, XFIX_N_0_899
+    smull2 v13.4s,  v9.8h, XFIX_N_2_562
+    smull2 v14.4s, v10.8h, XFIX_N_1_961
+    smull2 v15.4s, v11.8h, XFIX_N_0_390
+    smull   v8.4s,  v8.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
+    smull   v9.4s,  v9.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
+    smull  v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
+    smull  v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+
+    add v10.4s, v10.4s, v4.4s  /* z3 += z5 */
+    add v14.4s, v14.4s, v5.4s
+    add v11.4s, v11.4s, v4.4s  /* z4 += z5 */
+    add v15.4s, v15.4s, v5.4s
+
+    add v28.4s, v28.4s,  v8.4s  /* tmp4 += z1 */
+    add v24.4s, v24.4s, v12.4s
+    add v29.4s, v29.4s,  v9.4s  /* tmp5 += z2 */
+    add v25.4s, v25.4s, v13.4s
+    add v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add v26.4s, v26.4s, v14.4s
+    add v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add v27.4s, v27.4s, v15.4s
+
+    add v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add v24.4s, v24.4s, v14.4s
+    add v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add v25.4s, v25.4s, v15.4s
+    add v30.4s, v30.4s,  v9.4s  /* tmp6 += z2 */
+    add v26.4s, v26.4s, v13.4s
+    add v31.4s, v31.4s,  v8.4s  /* tmp7 += z1 */
+    add v27.4s, v27.4s, v12.4s
+
+    rshrn  v23.4h, v28.4s, #DESCALE_P1
+    rshrn  v21.4h, v29.4s, #DESCALE_P1
+    rshrn  v19.4h, v30.4s, #DESCALE_P1
+    rshrn  v17.4h, v31.4s, #DESCALE_P1
+    rshrn2 v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2 v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2 v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2 v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+    /* Transpose */
+    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+
+    /* 1-D FDCT */
+    add v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
+    sub v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
+    add v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
+    sub v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
+    add v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
+    sub v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
+    add v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
+    sub v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
+
+    /* even part */
+
+    add v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
+    sub v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
+    add v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
+    sub v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
+
+    add v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
+    sub v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
+
+    add v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
+
+    srshr  v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
+    srshr  v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
+
+    smull2 v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    smull  v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+    mov    v22.16b, v18.16b
+    mov    v25.16b, v24.16b
+
+    smlal  v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal2 v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+    smlal  v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+    smlal2 v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+    rshrn  v18.4h, v18.4s, #DESCALE_P2
+    rshrn  v22.4h, v22.4s, #DESCALE_P2
+    rshrn2 v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765),
+                                                                          CONST_BITS-PASS1_BITS); */
+    rshrn2 v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847),
+                                                                          CONST_BITS-PASS1_BITS); */
+
+    /* Odd part */
+
+    add  v8.8h, v28.8h, v31.8h  /* z1 = tmp4 + tmp7; */
+    add  v9.8h, v29.8h, v30.8h  /* z2 = tmp5 + tmp6; */
+    add v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
+    add v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
+
+    smull  v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
+    smull2 v5.4s, v10.8h, XFIX_P_1_175
+    smlal  v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+    smlal2 v5.4s, v11.8h, XFIX_P_1_175
+
+    smull2 v24.4s, v28.8h, XFIX_P_0_298
+    smull2 v25.4s, v29.8h, XFIX_P_2_053
+    smull2 v26.4s, v30.8h, XFIX_P_3_072
+    smull2 v27.4s, v31.8h, XFIX_P_1_501
+    smull  v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+    smull  v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+    smull  v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+    smull  v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+    smull2 v12.4s,  v8.8h, XFIX_N_0_899
+    smull2 v13.4s,  v9.8h, XFIX_N_2_562
+    smull2 v14.4s, v10.8h, XFIX_N_1_961
+    smull2 v15.4s, v11.8h, XFIX_N_0_390
+    smull   v8.4s,  v8.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
+    smull   v9.4s,  v9.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
+    smull  v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
+    smull  v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+
+    add v10.4s, v10.4s, v4.4s
+    add v14.4s, v14.4s, v5.4s
+    add v11.4s, v11.4s, v4.4s
+    add v15.4s, v15.4s, v5.4s
+
+    add v28.4s, v28.4s,  v8.4s  /* tmp4 += z1 */
+    add v24.4s, v24.4s, v12.4s
+    add v29.4s, v29.4s,  v9.4s  /* tmp5 += z2 */
+    add v25.4s, v25.4s, v13.4s
+    add v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
+    add v26.4s, v26.4s, v14.4s
+    add v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
+    add v27.4s, v27.4s, v15.4s
+
+    add v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
+    add v24.4s, v24.4s, v14.4s
+    add v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
+    add v25.4s, v25.4s, v15.4s
+    add v30.4s, v30.4s,  v9.4s  /* tmp6 += z2 */
+    add v26.4s, v26.4s, v13.4s
+    add v31.4s, v31.4s,  v8.4s  /* tmp7 += z1 */
+    add v27.4s, v27.4s, v12.4s
+
+    rshrn  v23.4h, v28.4s, #DESCALE_P2
+    rshrn  v21.4h, v29.4s, #DESCALE_P2
+    rshrn  v19.4h, v30.4s, #DESCALE_P2
+    rshrn  v17.4h, v31.4s, #DESCALE_P2
+    rshrn2 v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2 v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2 v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2 v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+    /* store results */
+    st1     {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1     {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+    /* Restore NEON registers */
+    sub             sp, sp, #64
+    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+    br              x30
+
+    .unreq          DATA
+    .unreq          TMP
+
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the forward DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
+ * function from jfdctfst.c.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ *       rid of a bunch of LD1 instructions
+ */
+
+#undef XFIX_0_541196100
+#define XFIX_0_382683433 v0.h[0]
+#define XFIX_0_541196100 v0.h[1]
+#define XFIX_0_707106781 v0.h[2]
+#define XFIX_1_306562965 v0.h[3]
+
+.balign 16
+Ljsimd_fdct_ifast_neon_consts:
+    .short (98 * 128)               /* XFIX_0_382683433 */
+    .short (139 * 128)              /* XFIX_0_541196100 */
+    .short (181 * 128)              /* XFIX_0_707106781 */
+    .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
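+
+/* These are the jfdctfst.c scaled-integer constants (98/256, 139/256,
+ * 181/256, 334/256) rescaled for sqdmulh, which computes (a * b) >> 15.
+ * XFIX_1_306562965 stores only the excess over 1.0, since Q15 cannot
+ * represent values >= 1; the remaining 1.0 * x is added back separately. */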
+
+asm_function jsimd_fdct_ifast_neon
+
+    DATA    .req x0
+    TMP     .req x9
+
+    /* Load constants */
+    adr     TMP, Ljsimd_fdct_ifast_neon_consts
+    ld1     {v0.4h}, [TMP]
+
+    /* Load all DATA into NEON registers with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 |      v16.8h      |
+     *   1 |      v17.8h      |
+     *   2 |      v18.8h      |
+     *   3 |      v19.8h      |
+     *   4 |      v20.8h      |
+     *   5 |      v21.8h      |
+     *   6 |      v22.8h      |
+     *   7 |      v23.8h      |
+     */
+
+    ld1     {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    ld1     {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+    mov     TMP, #2
+    sub     DATA, DATA, #64
+1:
+    /* Transpose */
+    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
+    subs    TMP, TMP, #1
+    /* 1-D FDCT */
+    add      v4.8h, v19.8h, v20.8h
+    sub     v20.8h, v19.8h, v20.8h
+    sub     v28.8h, v18.8h, v21.8h
+    add     v18.8h, v18.8h, v21.8h
+    sub     v29.8h, v17.8h, v22.8h
+    add     v17.8h, v17.8h, v22.8h
+    sub     v21.8h, v16.8h, v23.8h
+    add     v16.8h, v16.8h, v23.8h
+    sub      v6.8h, v17.8h, v18.8h
+    sub      v7.8h, v16.8h, v4.8h
+    add      v5.8h, v17.8h, v18.8h
+    add      v6.8h,  v6.8h, v7.8h
+    add      v4.8h, v16.8h, v4.8h
+    sqdmulh  v6.8h,  v6.8h, XFIX_0_707106781
+    add     v19.8h, v20.8h, v28.8h
+    add     v16.8h,  v4.8h, v5.8h
+    sub     v20.8h,  v4.8h, v5.8h
+    add      v5.8h, v28.8h, v29.8h
+    add     v29.8h, v29.8h, v21.8h
+    sqdmulh  v5.8h,  v5.8h, XFIX_0_707106781
+    sub     v28.8h, v19.8h, v29.8h
+    add     v18.8h,  v7.8h, v6.8h
+    sqdmulh v28.8h, v28.8h, XFIX_0_382683433
+    sub     v22.8h,  v7.8h, v6.8h
+    sqdmulh v19.8h, v19.8h, XFIX_0_541196100
+    sqdmulh  v7.8h, v29.8h, XFIX_1_306562965
+    add      v6.8h, v21.8h, v5.8h
+    sub      v5.8h, v21.8h, v5.8h
+    add     v29.8h, v29.8h, v28.8h
+    add     v19.8h, v19.8h, v28.8h
+    add     v29.8h, v29.8h, v7.8h
+    add     v21.8h,  v5.8h, v19.8h
+    sub     v19.8h,  v5.8h, v19.8h
+    add     v17.8h,  v6.8h, v29.8h
+    sub     v23.8h,  v6.8h, v29.8h
+
+    b.ne    1b
+
+    /* store results */
+    st1     {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+    st1     {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+    br              x30
+
+    .unreq          DATA
+    .unreq          TMP
+#undef XFIX_0_382683433
+#undef XFIX_0_541196100
+#undef XFIX_0_707106781
+#undef XFIX_1_306562965
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
+ *                      DCTELEM * workspace);
+ *
+ */
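+/* The divisors table (prepared in jcdctmgr.c) holds four 64-entry arrays:
+ * reciprocals at offset 0, corrections at +128 bytes and shift counts at
+ * +384 bytes.  Each coefficient is quantized as
+ *     (((abs(coef) + correction) * reciprocal) >> 16) >> shift
+ * and the sign is then restored with the eor/sub pair below. */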
+asm_function jsimd_quantize_neon
+
+    COEF_BLOCK      .req x0
+    DIVISORS        .req x1
+    WORKSPACE       .req x2
+
+    RECIPROCAL      .req DIVISORS
+    CORRECTION      .req x9
+    SHIFT           .req x10
+    LOOP_COUNT      .req x11
+
+    mov             LOOP_COUNT, #2
+    add             CORRECTION, DIVISORS, #(64 * 2)
+    add             SHIFT, DIVISORS, #(64 * 6)
+1:
+    subs            LOOP_COUNT, LOOP_COUNT, #1
+    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
+    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
+    abs             v20.8h, v0.8h
+    abs             v21.8h, v1.8h
+    abs             v22.8h, v2.8h
+    abs             v23.8h, v3.8h
+    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
+    add             v20.8h, v20.8h, v4.8h  /* add correction */
+    add             v21.8h, v21.8h, v5.8h
+    add             v22.8h, v22.8h, v6.8h
+    add             v23.8h, v23.8h, v7.8h
+    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
+    umull2          v16.4s, v20.8h, v28.8h
+    umull           v5.4s, v21.4h, v29.4h
+    umull2          v17.4s, v21.8h, v29.8h
+    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
+    umull2          v18.4s, v22.8h, v30.8h
+    umull           v7.4s, v23.4h, v31.4h
+    umull2          v19.4s, v23.8h, v31.8h
+    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
+    shrn            v4.4h, v4.4s, #16
+    shrn            v5.4h, v5.4s, #16
+    shrn            v6.4h, v6.4s, #16
+    shrn            v7.4h, v7.4s, #16
+    shrn2           v4.8h, v16.4s, #16
+    shrn2           v5.8h, v17.4s, #16
+    shrn2           v6.8h, v18.4s, #16
+    shrn2           v7.8h, v19.4s, #16
+    neg             v24.8h, v24.8h
+    neg             v25.8h, v25.8h
+    neg             v26.8h, v26.8h
+    neg             v27.8h, v27.8h
+    sshr            v0.8h,  v0.8h,  #15  /* extract sign */
+    sshr            v1.8h,  v1.8h,  #15
+    sshr            v2.8h,  v2.8h,  #15
+    sshr            v3.8h,  v3.8h,  #15
+    ushl            v4.8h, v4.8h, v24.8h  /* shift */
+    ushl            v5.8h, v5.8h, v25.8h
+    ushl            v6.8h, v6.8h, v26.8h
+    ushl            v7.8h, v7.8h, v27.8h
+
+    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
+    eor             v5.16b, v5.16b, v1.16b
+    eor             v6.16b, v6.16b, v2.16b
+    eor             v7.16b, v7.16b, v3.16b
+    sub             v4.8h, v4.8h, v0.8h
+    sub             v5.8h, v5.8h, v1.8h
+    sub             v6.8h, v6.8h, v2.8h
+    sub             v7.8h, v7.8h, v3.8h
+    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
+
+    b.ne            1b
+
+    br              x30  /* return */
+
+    .unreq          COEF_BLOCK
+    .unreq          DIVISORS
+    .unreq          WORKSPACE
+    .unreq          RECIPROCAL
+    .unreq          CORRECTION
+    .unreq          SHIFT
+    .unreq          LOOP_COUNT
+
+/*****************************************************************************/
+
+/*
+ * Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ *
+ * GLOBAL(void)
+ * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
+ *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+ */
+
+.balign 16
+Ljsimd_h2v1_downsample_neon_consts:
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E  /* diff 0, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F  /* diff 0, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E  /* diff 1, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0E  /* diff 1, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0D  /* diff 2, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0D  /* diff 2, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0C  /* diff 3, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0C, 0x0C  /* diff 3, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0B, 0x0B  /* diff 4, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0B, 0x0B  /* diff 4, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0A, 0x0A  /* diff 5, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0A, 0x0A, 0x0A  /* diff 5, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x09, 0x09, 0x09  /* diff 6, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x09, 0x09, 0x09  /* diff 6, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x08, 0x08, 0x08  /* diff 7, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x08, 0x08, 0x08, 0x08  /* diff 7, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x07, 0x07, 0x07, 0x07  /* diff 8, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9, even */
+    .byte 0x01, 0x03, 0x05, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9, odd */
+    .byte 0x00, 0x02, 0x04, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10, even */
+    .byte 0x01, 0x03, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10, odd */
+    .byte 0x00, 0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11, even */
+    .byte 0x01, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11, odd */
+    .byte 0x00, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12, even */
+    .byte 0x01, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12, odd */
+    .byte 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13, even */
+    .byte 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13, odd */
+    .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14, even */
+    .byte 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14, odd */
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15, even */
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15, odd */
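+
+/* Each pair of rows gives tbl indices for the even and odd bytes of the
+ * last 16-byte chunk of a row.  Entry N (N = 16 * width_blocks -
+ * image_width) replicates the last valid pixel, edge-expanding rows whose
+ * width is not a multiple of 16 before the pairwise average. */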
+
+asm_function jsimd_h2v1_downsample_neon
+    IMAGE_WIDTH     .req x0
+    MAX_V_SAMP      .req x1
+    V_SAMP          .req x2
+    BLOCK_WIDTH     .req x3
+    INPUT_DATA      .req x4
+    OUTPUT_DATA     .req x5
+    OUTPTR          .req x9
+    INPTR           .req x10
+    TMP1            .req x11
+    TMP2            .req x12
+    TMP3            .req x13
+    TMPDUP          .req w15
+
+    mov   TMPDUP, #0x10000
+    lsl   TMP2, BLOCK_WIDTH, #4
+    sub   TMP2, TMP2, IMAGE_WIDTH
+    adr   TMP3, Ljsimd_h2v1_downsample_neon_consts
+    add   TMP3, TMP3, TMP2, lsl #4
+    dup   v16.4s, TMPDUP
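+    /* v16.8h = {0, 1, 0, 1, ...}: the alternating rounding bias used by
+     * the scalar h2v1_downsample in jcsample.c */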
+    ld1   {v18.8b, v19.8b}, [TMP3]
+
+1:  /* row loop */
+    ldr   INPTR, [INPUT_DATA], #8
+    ldr   OUTPTR, [OUTPUT_DATA], #8
+    subs  TMP1, BLOCK_WIDTH, #1
+    b.eq  3f
+2:  /* columns */
+    ld2   {v0.8b, v1.8b}, [INPTR], #16
+    subs  TMP1, TMP1, #1
+    uaddl v2.8h, v0.8b, v1.8b
+    add   v2.8h, v2.8h, v16.8h
+    shrn  v2.8b, v2.8h, #1
+    st1   {v2.8b}, [OUTPTR], #8
+    b.ne  2b
+3:  /* last columns */
+    ld1   {v0.16b}, [INPTR]
+    subs  V_SAMP, V_SAMP, #1
+    /* expand right */
+    tbl   v2.8b, {v0.16b}, v18.8b
+    tbl   v3.8b, {v0.16b}, v19.8b
+    uaddl v2.8h, v2.8b, v3.8b
+    add   v2.8h, v2.8h, v16.8h
+    shrn  v2.8b, v2.8h, #1
+    st1   {v2.8b}, [OUTPTR], #8
+    b.ne  1b
+
+    br              x30
+
+    .unreq          IMAGE_WIDTH
+    .unreq          MAX_V_SAMP
+    .unreq          V_SAMP
+    .unreq          BLOCK_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA
+    .unreq          OUTPTR
+    .unreq          INPTR
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMPDUP
+
+/*****************************************************************************/
+
+/*
+ * Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ *
+ * GLOBAL(void)
+ * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
+ *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+ *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+ */
+
+.balign 16
+Ljsimd_h2v2_downsample_neon_consts:
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E  /* diff 0, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F  /* diff 0, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E  /* diff 1, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0E  /* diff 1, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0D  /* diff 2, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0D  /* diff 2, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0C  /* diff 3, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0C, 0x0C  /* diff 3, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0B, 0x0B  /* diff 4, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0B, 0x0B  /* diff 4, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0A, 0x0A  /* diff 5, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x0A, 0x0A, 0x0A  /* diff 5, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x09, 0x09, 0x09  /* diff 6, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x09, 0x09, 0x09, 0x09  /* diff 6, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x08, 0x08, 0x08, 0x08  /* diff 7, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x08, 0x08, 0x08, 0x08  /* diff 7, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x07, 0x07, 0x07, 0x07  /* diff 8, even */
+    .byte 0x01, 0x03, 0x05, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8, odd */
+    .byte 0x00, 0x02, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9, even */
+    .byte 0x01, 0x03, 0x05, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9, odd */
+    .byte 0x00, 0x02, 0x04, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10, even */
+    .byte 0x01, 0x03, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10, odd */
+    .byte 0x00, 0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11, even */
+    .byte 0x01, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11, odd */
+    .byte 0x00, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12, even */
+    .byte 0x01, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12, odd */
+    .byte 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13, even */
+    .byte 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13, odd */
+    .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14, even */
+    .byte 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14, odd */
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15, even */
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15, odd */
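+
+/* Same edge-expansion index table as for h2v1 above. */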
+
+asm_function jsimd_h2v2_downsample_neon
+    IMAGE_WIDTH     .req x0
+    MAX_V_SAMP      .req x1
+    V_SAMP          .req x2
+    BLOCK_WIDTH     .req x3
+    INPUT_DATA      .req x4
+    OUTPUT_DATA     .req x5
+    OUTPTR          .req x9
+    INPTR0          .req x10
+    INPTR1          .req x14
+    TMP1            .req x11
+    TMP2            .req x12
+    TMP3            .req x13
+    TMPDUP          .req w15
+
+    mov   TMPDUP, #1
+    lsl   TMP2, BLOCK_WIDTH, #4
+    lsl   TMPDUP, TMPDUP, #17
+    sub   TMP2, TMP2, IMAGE_WIDTH
+    adr   TMP3, Ljsimd_h2v2_downsample_neon_consts
+    orr   TMPDUP, TMPDUP, #1
+    add   TMP3, TMP3, TMP2, lsl #4
+    dup   v16.4s, TMPDUP
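+    /* v16.8h = {1, 2, 1, 2, ...}: the alternating rounding bias used by
+     * the scalar h2v2_downsample in jcsample.c */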
+    ld1   {v18.8b, v19.8b}, [TMP3]
+
+1:  /* row loop */
+    ldr   INPTR0, [INPUT_DATA], #8
+    ldr   OUTPTR, [OUTPUT_DATA], #8
+    ldr   INPTR1, [INPUT_DATA], #8
+    subs  TMP1, BLOCK_WIDTH, #1
+    b.eq  3f
+2:  /* columns */
+    ld2   {v0.8b, v1.8b}, [INPTR0], #16
+    ld2   {v2.8b, v3.8b}, [INPTR1], #16
+    subs  TMP1, TMP1, #1
+    uaddl v4.8h, v0.8b, v1.8b
+    uaddl v6.8h, v2.8b, v3.8b
+    add   v4.8h, v4.8h, v6.8h
+    add   v4.8h, v4.8h, v16.8h
+    shrn  v4.8b, v4.8h, #2
+    st1   {v4.8b}, [OUTPTR], #8
+    b.ne  2b
+3:  /* last columns */
+    ld1   {v0.16b}, [INPTR0]
+    ld1   {v1.16b}, [INPTR1]
+    subs  V_SAMP, V_SAMP, #1
+    /* expand right */
+    tbl   v4.8b, {v0.16b}, v18.8b
+    tbl   v5.8b, {v0.16b}, v19.8b
+    tbl   v6.8b, {v1.16b}, v18.8b
+    tbl   v7.8b, {v1.16b}, v19.8b
+    uaddl v4.8h, v4.8b, v5.8b
+    uaddl v6.8h, v6.8b, v7.8b
+    add   v4.8h, v4.8h, v6.8h
+    add   v4.8h, v4.8h, v16.8h
+    shrn  v4.8b, v4.8h, #2
+    st1   {v4.8b}, [OUTPTR], #8
+    b.ne  1b
+
+    br              x30
+
+    .unreq          IMAGE_WIDTH
+    .unreq          MAX_V_SAMP
+    .unreq          V_SAMP
+    .unreq          BLOCK_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA
+    .unreq          OUTPTR
+    .unreq          INPTR0
+    .unreq          INPTR1
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMPDUP