From: jackychen
Date: Thu, 3 Dec 2015 23:21:36 +0000 (-0800)
Subject: Add vp9_avg_4x4_neon and the unit test.
X-Git-Tag: v1.6.0~498^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=303f144eefc739c44a3e18a24f8cf61d2fe103a7;p=libvpx

Add vp9_avg_4x4_neon and the unit test.

Change-Id: I3ef9a9648841374ed3cc865a02053c14ad821a20
---

diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index 290bdc75e..cbc667e5d 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -372,7 +372,10 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(
         make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
         make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon),
+        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_neon),
+        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_neon),
+        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_neon)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON, IntProRowTest, ::testing::Values(
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 8fe6503aa..d166bbf38 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -198,7 +198,7 @@ add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
 specialize qw/vp9_avg_8x8 sse2 neon msa/;
 
 add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_avg_4x4 sse2 msa/;
+specialize qw/vp9_avg_4x4 sse2 neon msa/;
 
 add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
 specialize qw/vp9_minmax_8x8 sse2/;
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index 5996bd426..78467cebd 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -24,6 +24,18 @@ static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
   return vget_lane_u32(c, 0);
 }
 
+unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
+  uint16x8_t v_sum;
+  uint32x2_t v_s0 = vdup_n_u32(0);
+  uint32x2_t v_s1 = vdup_n_u32(0);
+  v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
+  v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
+  v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
+  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
+}
+
 unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
   uint8x8_t v_s0 = vld1_u8(s);
   const uint8x8_t v_s1 = vld1_u8(s + p);