From 459cfc8bae26afde6a16421b6f0e5ff5269ebb80 Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Thu, 9 Feb 2023 11:57:10 +0000
Subject: [PATCH] Optimize Neon high bitdepth convolve copy

Use standard loads and stores instead of the significantly slower
interleaving/de-interleaving variants. Also move all loads in loop
bodies above all stores, as a mitigation against the compiler
assuming that the src and dst pointers alias (we can't use restrict
in C89).

Change-Id: Idd59dca51387f553f8db27144a2b8f2377c937d3
---
 vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 106 +++++++++++---------
 1 file changed, 59 insertions(+), 47 deletions(-)

diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
index 9d2752e09..775108208 100644
--- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -26,76 +26,88 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
   (void)bd;
 
   if (w < 8) {  // copy4
+    uint16x4_t s0, s1;
     do {
-      vst1_u16(dst, vld1_u16(src));
+      s0 = vld1_u16(src);
       src += src_stride;
-      dst += dst_stride;
-      vst1_u16(dst, vld1_u16(src));
+      s1 = vld1_u16(src);
       src += src_stride;
+
+      vst1_u16(dst, s0);
+      dst += dst_stride;
+      vst1_u16(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 8) {  // copy8
+    uint16x8_t s0, s1;
     do {
-      vst1q_u16(dst, vld1q_u16(src));
+      s0 = vld1q_u16(src);
       src += src_stride;
-      dst += dst_stride;
-      vst1q_u16(dst, vld1q_u16(src));
+      s1 = vld1q_u16(src);
       src += src_stride;
+
+      vst1q_u16(dst, s0);
+      dst += dst_stride;
+      vst1q_u16(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w < 32) {  // copy16
+    uint16x8_t s0, s1, s2, s3;
     do {
-      vst2q_u16(dst, vld2q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      vst2q_u16(dst, vld2q_u16(src));
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
       src += src_stride;
-      dst += dst_stride;
-      vst2q_u16(dst, vld2q_u16(src));
+      s2 = vld1q_u16(src);
+      s3 = vld1q_u16(src + 8);
       src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
       dst += dst_stride;
-      vst2q_u16(dst, vld2q_u16(src));
-      src += src_stride;
+      vst1q_u16(dst, s2);
+      vst1q_u16(dst + 8, s3);
       dst += dst_stride;
-      h -= 4;
-    } while (h > 0);
+      h -= 2;
+    } while (h != 0);
   } else if (w == 32) {  // copy32
+    uint16x8_t s0, s1, s2, s3;
     do {
-      vst4q_u16(dst, vld4q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
+      s2 = vld1q_u16(src + 16);
+      s3 = vld1q_u16(src + 24);
       src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
+      vst1q_u16(dst + 16, s2);
+      vst1q_u16(dst + 24, s3);
       dst += dst_stride;
-      h -= 4;
-    } while (h > 0);
+    } while (--h != 0);
   } else {  // copy64
+    uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
     do {
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
+      s2 = vld1q_u16(src + 16);
+      s3 = vld1q_u16(src + 24);
+      s4 = vld1q_u16(src + 32);
+      s5 = vld1q_u16(src + 40);
+      s6 = vld1q_u16(src + 48);
+      s7 = vld1q_u16(src + 56);
       src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 4;
-    } while (h > 0);
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
+      vst1q_u16(dst + 16, s2);
+      vst1q_u16(dst + 24, s3);
+      vst1q_u16(dst + 32, s4);
+      vst1q_u16(dst + 40, s5);
+      vst1q_u16(dst + 48, s6);
+      vst1q_u16(dst + 56, s7);
+      dst += dst_stride;
+    } while (--h != 0);
   }
 }
--
2.40.0
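
Note: the following standalone sketch (not part of the patch or of
libvpx) illustrates the pattern the diff applies, assuming a
hypothetical copy16_rows helper rather than libvpx's actual entry
point. It uses the plain vld1q_u16/vst1q_u16 intrinsics instead of
vld2q_u16/vst2q_u16 (which de-interleave on load and re-interleave on
store, permute work a straight copy does not need), and it issues all
of a row's loads before any of its stores, so the compiler need not
assume a store through dst can modify data later loaded through src.

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical helper (not the libvpx API): copy h rows of 16
     * 16-bit pixels. All loads in an iteration precede all stores,
     * mitigating the aliasing assumption that C89's lack of restrict
     * would otherwise force on the compiler. */
    static void copy16_rows(const uint16_t *src, ptrdiff_t src_stride,
                            uint16_t *dst, ptrdiff_t dst_stride, int h) {
      do {
        /* Loads first: two non-interleaving 8-lane loads per row. */
        const uint16x8_t s0 = vld1q_u16(src);
        const uint16x8_t s1 = vld1q_u16(src + 8);
        src += src_stride;

        /* Stores only after every load has been issued. */
        vst1q_u16(dst, s0);
        vst1q_u16(dst + 8, s1);
        dst += dst_stride;
      } while (--h != 0);
    }

For a pure copy, the old ld2/st2 round trip was value-preserving
(vst2q_u16 re-interleaves exactly what vld2q_u16 de-interleaved), so
the previous code was correct; it simply paid for the permutes on
every load and store.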