From f7364c05748b70a1e0fd57849665a9d9f0990803 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 6 May 2021 15:11:52 +0100 Subject: [PATCH] Manually unroll the inner loop of Neon sad16x_4d() Manually unrolling the inner loop is sufficient to stop the compiler getting confused and emitting inefficient code. Co-authored-by: James Greenhalgh Bug: b/181236880 Change-Id: I860768ce0e6c0e0b6286d3fc1b94f0eae95d0a1a --- vpx_dsp/arm/sad4d_neon.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 34c0a7ade..256bc41ce 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -243,7 +243,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t *res, const int height) { - int i, j; + int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), @@ -252,10 +252,15 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, for (i = 0; i < height; ++i) { const uint8x16_t s = vld1q_u8(src_ptr); src_ptr += src_stride; - for (j = 0; j < 4; ++j) { - sad16_neon(ref_loop[j], s, &sum[j]); - ref_loop[j] += ref_stride; - } + /* Manual unrolling here stops the compiler from getting confused. */ + sad16_neon(ref_loop[0], s, &sum[0]); + ref_loop[0] += ref_stride; + sad16_neon(ref_loop[1], s, &sum[1]); + ref_loop[1] += ref_stride; + sad16_neon(ref_loop[2], s, &sum[2]); + ref_loop[2] += ref_stride; + sad16_neon(ref_loop[3], s, &sum[3]); + ref_loop[3] += ref_stride; } sad_512_pel_final_neon(sum, res); -- 2.40.0