From: Frank Galligan Date: Tue, 16 Jun 2015 19:58:39 +0000 (-0700) Subject: Add vp9_int_pro_row_neon. X-Git-Tag: v1.5.0~475^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=944ad6cac9391fc84c189c31a950e8e88ee62205;p=libvpx Add vp9_int_pro_row_neon. BUG=https://code.google.com/p/webm/issues/detail?id=1022 Change-Id: I510c3b0a70158fa2e4da554f7c5d7558021a6ddf --- diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc index 56b5250db..09dbaf3ed 100644 --- a/test/vp9_avg_test.cc +++ b/test/vp9_avg_test.cc @@ -286,6 +286,11 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon), make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon))); +INSTANTIATE_TEST_CASE_P( + NEON, IntProRowTest, ::testing::Values( + make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c), + make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c), + make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c))); #endif } // namespace diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index dac142397..cb26da2b9 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -896,7 +896,7 @@ add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length"; specialize qw/vp9_satd sse2/; add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height"; -specialize qw/vp9_int_pro_row sse2/; +specialize qw/vp9_int_pro_row sse2 neon/; add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width"; specialize qw/vp9_int_pro_col sse2/; diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c index f505fcb7a..fecab5795 100644 --- a/vp9/encoder/arm/neon/vp9_avg_neon.c +++ b/vp9/encoder/arm/neon/vp9_avg_neon.c @@ -47,3 +47,56 @@ unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) { return (horizontal_add_u16x8(v_sum) + 32) >> 6; } + /* Sums each of 16 adjacent byte columns of ref over the rows of the block (rows are ref_stride bytes apart), scales every column sum down by a right shift of ((height >> 5) plus 3) bits, and writes the 16 results to hbuf. Rows are consumed eight per loop pass. The test hunk in this patch instantiates it with 16, 32 and 64 (presumably the heights under test) and compares against vp9_int_pro_row_c. */ +void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, + const int ref_stride, const int height) { + int i; + 
uint16x8_t vec_sum_lo = vdupq_n_u16(0); /* running 16-bit sums for columns 0-7 */ + uint16x8_t vec_sum_hi = vdupq_n_u16(0); /* running 16-bit sums for columns 8-15 */ + const int shift_factor = ((height >> 5) + 3) * -1; /* height 16 gives -3, 32 gives -4, 64 gives -5 */ + const int16x8_t vec_shift = vdupq_n_s16(shift_factor); /* same count in every lane; a negative count makes vshlq_u16 shift right (per the NEON VSHL definition) */ + /* Each pass loads eight rows of 16 bytes. Because i advances by 8, a height that is not a multiple of 8 would read rows beyond the requested height. */ + for (i = 0; i < height; i += 8) { + const uint8x16_t vec_row1 = vld1q_u8(ref); + const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); + const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); + const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); + const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); + const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); + const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); + const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); + /* Widening adds: the low and high 8 bytes of every row are accumulated into the two 16-bit sum vectors. With 8-bit inputs a 16-bit lane cannot wrap until more than 257 rows have been added. */ + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); + + ref += ref_stride * 8; /* step to the next group of eight rows */ + } + /* Scale the column sums down by the height-dependent shift computed above. */ + vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); + vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); + /* Store the results as int16: columns 0-7 to hbuf[0..7], then columns 8-15 to hbuf[8..15]. */ + vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); + hbuf += 8; + vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); +}