From a28d43658e3347d55d70655e6ee3d87d0d3fba8a Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Thu, 6 May 2021 14:51:05 +0100
Subject: [PATCH] Optimize Neon SAD reductions using wider ADDP instruction

Implement AArch64-only paths for each of the Neon SAD reduction
functions, making use of a wider pairwise addition instruction that is
only available on AArch64. This change removes the need to shuffle
between the high and low halves of Neon vectors, resulting in a faster
reduction that requires fewer instructions.

Bug: b/181236880
Change-Id: I1c48580b4aec27222538eeab44e38ecc1f2009dc
---
 vpx_dsp/arm/sad4d_neon.c | 53 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c
index 06443c699..34c0a7ade 100644
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -34,7 +34,9 @@ static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
                             uint32_t *const res) {
   int i;
   uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+#if !defined(__aarch64__)
   uint16x4_t a[2];
+#endif
   uint32x4_t r;
 
   assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
@@ -51,9 +53,14 @@ static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
     abs[1] = vabal_u8(abs[1], s, ref23);
   }
 
+#if defined(__aarch64__)
+  abs[0] = vpaddq_u16(abs[0], abs[1]);
+  r = vpaddlq_u16(abs[0]);
+#else
   a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));
   a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
   r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
+#endif
   vst1q_u32(res, r);
 }
 
@@ -74,6 +81,12 @@ void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
 
 // Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
 static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                           uint32_t *const res) {
+#if defined(__aarch64__)
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+  const uint16x8_t b0 = vpaddq_u16(a0, a1);
+  const uint32x4_t r = vpaddlq_u16(b0);
+#else
   const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
   const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
   const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
@@ -81,12 +94,21 @@ static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
   const uint16x4_t b0 = vpadd_u16(a0, a1);
   const uint16x4_t b1 = vpadd_u16(a2, a3);
   const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
   vst1q_u32(res, r);
 }
 
 // Can handle 1024 pixels' sad sum (such as 32x32)
 static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                            uint32_t *const res) {
+#if defined(__aarch64__)
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+  const uint32x4_t b0 = vpaddlq_u16(a0);
+  const uint32x4_t b1 = vpaddlq_u16(a1);
+  const uint32x4_t r = vpaddq_u32(b0, b1);
+  vst1q_u32(res, r);
+#else
   const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
   const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
   const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
@@ -96,11 +118,22 @@ static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
   const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
   const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
   vst1q_u32(res, vcombine_u32(c0, c1));
+#endif
 }
 
 // Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
 static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
                                            uint32_t *const res) {
+#if defined(__aarch64__)
+  const uint32x4_t a0 = vpaddlq_u16(sum[0]);
+  const uint32x4_t a1 = vpaddlq_u16(sum[1]);
+  const uint32x4_t a2 = vpaddlq_u16(sum[2]);
+  const uint32x4_t a3 = vpaddlq_u16(sum[3]);
+  const uint32x4_t b0 = vpaddq_u32(a0, a1);
+  const uint32x4_t b1 = vpaddq_u32(a2, a3);
+  const uint32x4_t r = vpaddq_u32(b0, b1);
+  vst1q_u32(res, r);
+#else
   const uint32x4_t a0 = vpaddlq_u16(sum[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum[1]);
   const uint32x4_t a2 = vpaddlq_u16(sum[2]);
@@ -112,11 +145,30 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
   const uint32x2_t c0 = vpadd_u32(b0, b1);
   const uint32x2_t c1 = vpadd_u32(b2, b3);
   vst1q_u32(res, vcombine_u32(c0, c1));
+#endif
 }
 
 // Can handle 4096 pixels' sad sum (such as 64x64)
 static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
                                            uint32_t *const res) {
+#if defined(__aarch64__)
+  const uint32x4_t a0 = vpaddlq_u16(sum[0]);
+  const uint32x4_t a1 = vpaddlq_u16(sum[1]);
+  const uint32x4_t a2 = vpaddlq_u16(sum[2]);
+  const uint32x4_t a3 = vpaddlq_u16(sum[3]);
+  const uint32x4_t a4 = vpaddlq_u16(sum[4]);
+  const uint32x4_t a5 = vpaddlq_u16(sum[5]);
+  const uint32x4_t a6 = vpaddlq_u16(sum[6]);
+  const uint32x4_t a7 = vpaddlq_u16(sum[7]);
+  const uint32x4_t b0 = vaddq_u32(a0, a1);
+  const uint32x4_t b1 = vaddq_u32(a2, a3);
+  const uint32x4_t b2 = vaddq_u32(a4, a5);
+  const uint32x4_t b3 = vaddq_u32(a6, a7);
+  const uint32x4_t c0 = vpaddq_u32(b0, b1);
+  const uint32x4_t c1 = vpaddq_u32(b2, b3);
+  const uint32x4_t r = vpaddq_u32(c0, c1);
+  vst1q_u32(res, r);
+#else
   const uint32x4_t a0 = vpaddlq_u16(sum[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum[1]);
   const uint32x4_t a2 = vpaddlq_u16(sum[2]);
@@ -136,6 +188,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
   const uint32x2_t d0 = vpadd_u32(c0, c1);
   const uint32x2_t d1 = vpadd_u32(c2, c3);
   vst1q_u32(res, vcombine_u32(d0, d1));
+#endif
 }
 
 static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
-- 
2.49.0
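
For reviewers, below is a minimal standalone sketch of the transformation
the patch applies, using the sad4x_4d reduction as the example. It is not
part of the patch, and the helper names reduce_portable and reduce_aarch64
are illustrative only. Both functions compute the same four 32-bit sums:
the portable path extracts and recombines 64-bit halves, while the
AArch64-only path uses the 128-bit ADDP form (vpaddq_u16) directly.

#include <arm_neon.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Portable Neon reduction: pairwise-add the 64-bit halves of each
 * 128-bit accumulator, recombine them, then widen the 16-bit lanes
 * to 32 bits with a pairwise add-long. */
static void reduce_portable(uint16x8_t abs0, uint16x8_t abs1,
                            uint32_t *res) {
  const uint16x4_t a0 = vpadd_u16(vget_low_u16(abs0), vget_high_u16(abs0));
  const uint16x4_t a1 = vpadd_u16(vget_low_u16(abs1), vget_high_u16(abs1));
  vst1q_u32(res, vpaddlq_u16(vcombine_u16(a0, a1)));
}

#if defined(__aarch64__)
/* AArch64-only reduction: the 128-bit ADDP form pairs lanes across
 * both accumulators in a single instruction, so no half extraction
 * or recombination is needed. */
static void reduce_aarch64(uint16x8_t abs0, uint16x8_t abs1,
                           uint32_t *res) {
  vst1q_u32(res, vpaddlq_u16(vpaddq_u16(abs0, abs1)));
}
#endif

int main(void) {
  /* Each accumulator holds per-lane SAD partial sums for two reference
   * blocks (four 16-bit lanes per block). Expected result for both
   * paths: { 10, 26, 100, 260 }. */
  const uint16_t in0[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  const uint16_t in1[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
  uint32_t res[4];

  reduce_portable(vld1q_u16(in0), vld1q_u16(in1), res);
  printf("portable: %" PRIu32 " %" PRIu32 " %" PRIu32 " %" PRIu32 "\n",
         res[0], res[1], res[2], res[3]);

#if defined(__aarch64__)
  reduce_aarch64(vld1q_u16(in0), vld1q_u16(in1), res);
  printf("aarch64:  %" PRIu32 " %" PRIu32 " %" PRIu32 " %" PRIu32 "\n",
         res[0], res[1], res[2], res[3]);
#endif
  return 0;
}

The same pattern generalizes to the wider final reductions in the patch:
vpaddq_u32 replaces the vget/vpadd/vcombine sequences on 32-bit lanes.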