make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+ NEON, IntProColTest, ::testing::Values(
+ make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
+ make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
+ make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
#endif
#if HAVE_MSA
specialize qw/vp9_int_pro_row sse2 neon/;
add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
-specialize qw/vp9_int_pro_col sse2/;
+specialize qw/vp9_int_pro_col sse2 neon/;
add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
specialize qw/vp9_vector_var sse2/;
hbuf += 8;
vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
}
+
+int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
+ int i;
+ uint16x8_t vec_sum = vdupq_n_u16(0);
+
+ for (i = 0; i < width; i += 16) {
+ const uint8x16_t vec_row = vld1q_u8(ref);
+ vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
+ vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
+ ref += 16;
+ }
+
+ return horizontal_add_u16x8(vec_sum);
+}