From 2d3e33388211d2f0539900671a87a874e25e5240 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Tue, 8 May 2018 12:52:10 -0700 Subject: [PATCH] Update SadMxNx4 NEON functions Change-Id: Ia313a6da00a05837fcd4de6ece31fa1c0016438c --- vpx_dsp/arm/mem_neon.h | 12 +- vpx_dsp/arm/sad4d_neon.c | 380 ++++++++++++++++++++++++++------------- vpx_dsp/arm/sum_neon.h | 9 - 3 files changed, 262 insertions(+), 139 deletions(-) diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 12c0a54c8..6745464d7 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -101,9 +101,9 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { if (stride == 4) return vld1_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1_lane_u32(&a, a_u32, 0); + a_u32 = vset_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); - a_u32 = vld1_lane_u32(&a, a_u32, 1); + a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); } @@ -127,16 +127,16 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 0); + a_u32 = vsetq_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 1); + a_u32 = vsetq_lane_u32(a, a_u32, 1); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 2); + a_u32 = vsetq_lane_u32(a, a_u32, 2); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 3); + a_u32 = vsetq_lane_u32(a, a_u32, 3); return vreinterpretq_u8_u32(a_u32); } diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index b04de3aff..535ec0f0d 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -10,64 +10,152 @@ #include +#include #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" +static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, + const void *const buf1) { + uint32_t a; + uint32x2_t aa = vdup_n_u32(0); + memcpy(&a, buf0, 4); + aa = vset_lane_u32(a, aa, 0); + memcpy(&a, buf1, 4); + aa = vset_lane_u32(a, aa, 1); + return vreinterpret_u8_u32(aa); +} + +static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride, + const uint8_t *const ref[4], const int ref_stride, + const int height, uint32_t *const res) { + int i; + uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + uint16x4_t a[2]; + uint32x4_t r; + + assert(!((intptr_t)src % sizeof(uint32_t))); + assert(!(src_stride % sizeof(uint32_t))); + + for (i = 0; i < height; ++i) { + const uint8x8_t s = vreinterpret_u8_u32( + vld1_dup_u32((const uint32_t *)(src + i * src_stride))); + const uint8x8_t ref01 = load_unaligned_2_buffers(ref[0] + i * ref_stride, + ref[1] + i * ref_stride); + const uint8x8_t ref23 = load_unaligned_2_buffers(ref[2] + i * ref_stride, + ref[3] + i * ref_stride); + abs[0] = vabal_u8(abs[0], s, ref01); + abs[1] = vabal_u8(abs[1], s, ref23); + } + + a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); + a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); + r = vpaddlq_u16(vcombine_u16(a[0], a[1])); + vst1q_u32(res, r); +} + void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride); - for (i = 0; i < 4; ++i) { - const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride); - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); - } + sad4x_4d(src, src_stride, ref, ref_stride, 4, res); } void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride); - const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride); - for (i = 0; i < 4; ++i) { - const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride); - const uint8x16_t ref_1 = - load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride); - uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0)); - abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0)); - abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1)); - abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1)); - res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); - } + sad4x_4d(src, src_stride, ref, ref_stride, 8, res); +} + +//////////////////////////////////////////////////////////////////////////////// + +// Can handle 512 pixels' sad sum (such as 16x32 or 32x16) +static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); + vst1q_u32(res, r); } -static INLINE void sad8x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +// Can handle 1024 pixels' sad sum (such as 32x32) +static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1)); + const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); + const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + vst1q_u32(res, vcombine_u32(c0, c1)); +} + +// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) +static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0)); + const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1)); + const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2)); + const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); + const uint32x2_t c0 = vpadd_u32(b0, b1); + const uint32x2_t c1 = vpadd_u32(b2, b3); + vst1q_u32(res, vcombine_u32(c0, c1)); +} + +// Can handle 4096 pixels' sad sum (such as 64x64) +static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, + uint32_t *const res) { + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x4_t a4 = vpaddlq_u16(sum[4]); + const uint32x4_t a5 = vpaddlq_u16(sum[5]); + const uint32x4_t a6 = vpaddlq_u16(sum[6]); + const uint32x4_t a7 = vpaddlq_u16(sum[7]); + const uint32x4_t b0 = vaddq_u32(a0, a1); + const uint32x4_t b1 = vaddq_u32(a2, a3); + const uint32x4_t b2 = vaddq_u32(a4, a5); + const uint32x4_t b3 = vaddq_u32(a6, a7); + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + vst1q_u32(res, vcombine_u32(d0, d1)); +} + +static INLINE void sad8x_4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res, const int height) { int i, j; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(a); - a += a_stride; + const uint8x8_t s = vld1_u8(src); + src += src_stride; for (j = 0; j < 4; ++j) { - const uint8x8_t b_u8 = vld1_u8(b_loop[j]); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], a_u8, b_u8); + const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); + ref_loop[j] += ref_stride; + sum[j] = vabal_u8(sum[j], s, b_u8); } } - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_512_pel_final_neon(sum, res); } void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, @@ -88,28 +176,33 @@ void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, sad8x_4d(src, src_stride, ref, ref_stride, res, 16); } -static INLINE void sad16x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void sad16_neon(const uint8_t *ref, const uint8x16_t src, + uint16x8_t *const sum) { + const uint8x16_t r = vld1q_u8(ref); + *sum = vabal_u8(*sum, vget_low_u8(src), vget_low_u8(r)); + *sum = vabal_u8(*sum, vget_high_u8(src), vget_high_u8(r)); +} + +static INLINE void sad16x_4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res, const int height) { int i, j; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - a += a_stride; + const uint8x16_t s = vld1q_u8(src); + src += src_stride; for (j = 0; j < 4; ++j) { - const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); + sad16_neon(ref_loop[j], s, &sum[j]); + ref_loop[j] += ref_stride; } } - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_512_pel_final_neon(sum, res); } void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, @@ -130,113 +223,152 @@ void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, sad16x_4d(src, src_stride, ref, ref_stride, res, 32); } -static INLINE void sad32x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { - int i, j; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void sad32x_4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + const int height, uint16x8_t *const sum) { + int i; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + + sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - a += a_stride; - for (j = 0; j < 4; ++j) { - const uint8x16_t b_0 = vld1q_u8(b_loop[j]); - const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); - sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); - } - } + uint8x16_t s; - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + s = vld1q_u8(src + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + + s = vld1q_u8(src + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + src += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; } } void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 16); + uint16x8_t sum[4]; + sad32x_4d(src, src_stride, ref, ref_stride, 16, sum); + sad_512_pel_final_neon(sum, res); } void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 32); + uint16x8_t sum[4]; + sad32x_4d(src, src_stride, ref, ref_stride, 32, sum); + sad_1024_pel_final_neon(sum, res); } void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 64); + uint16x8_t sum[4]; + sad32x_4d(src, src_stride, ref, ref_stride, 64, sum); + sad_2048_pel_final_neon(sum, res); } -static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1, - const uint8x16_t b_0, const uint8x16_t b_1, - uint16x8_t *sum) { - *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0)); - *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0)); - *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1)); - *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1)); -} +//////////////////////////////////////////////////////////////////////////////// -static INLINE void sad64x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { int i; - uint16x8_t sum_0 = vdupq_n_u16(0); - uint16x8_t sum_1 = vdupq_n_u16(0); - uint16x8_t sum_2 = vdupq_n_u16(0); - uint16x8_t sum_3 = vdupq_n_u16(0); - uint16x8_t sum_4 = vdupq_n_u16(0); - uint16x8_t sum_5 = vdupq_n_u16(0); - uint16x8_t sum_6 = vdupq_n_u16(0); - uint16x8_t sum_7 = vdupq_n_u16(0); - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - a += a_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0); - sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48), - &sum_1); - b_loop[0] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2); - sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48), - &sum_3); - b_loop[1] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4); - sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48), - &sum_5); - b_loop[2] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6); - sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48), - &sum_7); - b_loop[3] += b_stride; - } + for (i = 0; i < 32; ++i) { + uint8x16_t s; - result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0); - result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0); - result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0); - result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0); -} + s = vld1q_u8(src + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); -void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - sad64x_4d(src, src_stride, ref, ref_stride, res, 32); + s = vld1q_u8(src + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + s = vld1q_u8(src + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); + + s = vld1q_u8(src + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); + + src += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + sad_2048_pel_final_neon(sum, res); } void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad64x_4d(src, src_stride, ref, ref_stride, res, 64); + int i; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0) }; + + for (i = 0; i < 64; ++i) { + uint8x16_t s; + + s = vld1q_u8(src + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); + + s = vld1q_u8(src + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); + + s = vld1q_u8(src + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); + + s = vld1q_u8(src + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); + + src += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + sad_4096_pel_final_neon(sum, res); } diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index d74fe0cde..c09841223 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -30,15 +30,6 @@ static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) { vreinterpret_u32_u64(vget_high_u64(c))); } -static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a, - const uint16x8_t b) { - const uint32x4_t c = vpaddlq_u16(a); - const uint32x4_t d = vpadalq_u16(c, b); - const uint64x2_t e = vpaddlq_u32(d); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)), - vreinterpret_u32_u64(vget_high_u64(e))); -} - static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { const uint64x2_t b = vpaddlq_u32(a); return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), -- 2.40.0