From 68cd3052cadad08fa85b7f02a4f303a4418e1d25 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 15 Oct 2016 11:04:37 -0700 Subject: [PATCH] vpx_highbd_convolve_copy_neon: use multi reg loads for copy16/32/64 BUG=webm:1299 Change-Id: I5080d736bde7e487c80ef3d7024dda1e96a57eaf --- vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 57 ++++++++++++--------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index 68d57779b..a980ab1a3 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -49,44 +49,55 @@ void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride, } while (h > 0); } else if (w < 32) { // copy16 do { - vst1q_u16(dst, vld1q_u16(src)); - vst1q_u16(dst + 8, vld1q_u16(src + 8)); + vst2q_u16(dst, vld2q_u16(src)); src += src_stride; dst += dst_stride; - vst1q_u16(dst, vld1q_u16(src)); - vst1q_u16(dst + 8, vld1q_u16(src + 8)); + vst2q_u16(dst, vld2q_u16(src)); src += src_stride; dst += dst_stride; - h -= 2; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 4; } while (h > 0); } else if (w == 32) { // copy32 do { - vst1q_u16(dst, vld1q_u16(src)); - vst1q_u16(dst + 8, vld1q_u16(src + 8)); - vst1q_u16(dst + 16, vld1q_u16(src + 16)); - vst1q_u16(dst + 24, vld1q_u16(src + 24)); + vst4q_u16(dst, vld4q_u16(src)); src += src_stride; dst += dst_stride; - vst1q_u16(dst, vld1q_u16(src)); - vst1q_u16(dst + 8, vld1q_u16(src + 8)); - vst1q_u16(dst + 16, vld1q_u16(src + 16)); - vst1q_u16(dst + 24, vld1q_u16(src + 24)); + vst4q_u16(dst, vld4q_u16(src)); src += src_stride; dst += dst_stride; - h -= 2; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 4; } while (h > 0); } else { // copy64 do { - vst1q_u16(dst, vld1q_u16(src)); - vst1q_u16(dst + 8, vld1q_u16(src + 8)); - vst1q_u16(dst + 16, vld1q_u16(src + 16)); - vst1q_u16(dst + 24, vld1q_u16(src + 24)); - vst1q_u16(dst + 32, vld1q_u16(src + 32)); - vst1q_u16(dst + 40, vld1q_u16(src + 40)); - vst1q_u16(dst + 48, vld1q_u16(src + 48)); - vst1q_u16(dst + 56, vld1q_u16(src + 56)); + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); src += src_stride; dst += dst_stride; - } while (--h); + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + h -= 4; + } while (h > 0); } } -- 2.50.0