From: Johann Date: Tue, 11 Jul 2017 14:22:26 +0000 (-0700) Subject: sad4d neon: 16x[8,16,32] X-Git-Tag: v1.7.0~314^2~1 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=807ce8fb1e97d35bc9743c4b02d721c044725fa5;p=libvpx sad4d neon: 16x[8,16,32] Rewrite 16x16. Use half the accumulator registers. BUG=webm:1425 Change-Id: I44b48512b1e3629505d83c2645e800f53878ccc2 --- diff --git a/test/sad_test.cc b/test/sad_test.cc index e53ba54a1..345178d4f 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -677,7 +677,9 @@ INSTANTIATE_TEST_CASE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon), SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon), SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon), SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon), SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon), diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 420169ed3..6b7cc83a9 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -88,6 +88,48 @@ void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, sad8x_4d(src, src_stride, ref, ref_stride, res, 16); } +static INLINE void sad16x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); + } + } + + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 8); +} + +void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 16); +} + +void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 32); +} + static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { const uint32x4_t vec_l_lo = @@ -241,58 +283,3 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); } - -void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref0 = vld1q_u8(ref0); - const uint8x16_t vec_ref1 = vld1q_u8(ref1); - const uint8x16_t vec_ref2 = vld1q_u8(ref2); - const uint8x16_t vec_ref3 = vld1q_u8(ref3); - - vec_sum_ref0_lo = - vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); - vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref0)); - vec_sum_ref1_lo = - vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); - vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref1)); - vec_sum_ref2_lo = - vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); - vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref2)); - vec_sum_ref3_lo = - vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); - vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref3)); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); -} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8dfd2c0cc..4fe37b76b 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -872,13 +872,13 @@ add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, con specialize qw/vpx_sad32x16x4d msa sse2 vsx/; add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x32x4d msa sse2 vsx/; +specialize qw/vpx_sad16x32x4d neon msa sse2 vsx/; add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad16x16x4d neon msa sse2 vsx/; add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x8x4d msa sse2 vsx/; +specialize qw/vpx_sad16x8x4d neon msa sse2 vsx/; add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad8x16x4d neon msa sse2/;