From e40e78be246b2521749ce1fa6bdcd02e1f729a37 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 27 Jun 2017 13:02:28 -0700 Subject: [PATCH] sad neon: rewrite 8x8 and 8x16 BUG=webm:1425 Change-Id: I068f06c67b841f09ea07c04ada0c2f1706102138 --- test/sad_test.cc | 1 + vpx_dsp/arm/sad_neon.c | 79 +++++++++++++++--------------------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 3 files changed, 35 insertions(+), 47 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 00a4bc084..045980642 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -649,6 +649,7 @@ const SadMxNParam neon_tests[] = { SadMxNParam(16, 8, &vpx_sad16x8_neon), SadMxNParam(8, 16, &vpx_sad8x16_neon), SadMxNParam(8, 8, &vpx_sad8x8_neon), + SadMxNParam(8, 4, &vpx_sad8x4_neon), SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), }; diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index f5acb9149..cbc904feb 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -15,37 +15,6 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" -unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } - - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); -} - // TODO(johannkoenig): combine with avg_neon.h version. static INLINE uint32_t horizontal_add_16x8(const uint16x8_t vec_16x8) { const uint32x4_t a = vpaddlq_u16(vec_16x8); @@ -80,6 +49,39 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, return horizontal_add_16x8(abs); } +static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const int height) { + int i; + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + const uint8x8_t b_u8 = vld1_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, a_u8, b_u8); + } + return abs; +} + +uint32_t vpx_sad8x4_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride) { + const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, 4); + return horizontal_add_16x8(abs); +} + +uint32_t vpx_sad8x8_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride) { + const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, 8); + return horizontal_add_16x8(abs); +} + +uint32_t vpx_sad8x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, 16); + return horizontal_add_16x8(abs); +} + unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride) { uint8x16_t q0, q4; @@ -206,18 +208,3 @@ unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, } return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); } - -unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum = vdupq_n_u16(0); - - for (i = 0; i < 8; ++i) { - const uint8x8_t vec_src = vld1_u8(src); - const uint8x8_t vec_ref = vld1_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); - } - return horizontal_add_16x8(vec_accum); -} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 96e2bb7e4..88497b796 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -726,7 +726,7 @@ add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, specialize qw/vpx_sad8x8 neon msa sse2/; add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x4 msa sse2/; +specialize qw/vpx_sad8x4 neon msa sse2/; add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x8 neon msa sse2/; -- 2.49.0