}
}
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23, dd01, dd23;
+ dd01 = vdup_n_u8(0);
+ dd23 = vdup_n_u8(0);
+
+ s0 = vld1q_u8(src);
+ src += src_stride;
+ s1 = vld1q_u8(src);
+ src += src_stride;
+ s2 = vld1q_u8(src);
+ src += src_stride;
+ s3 = vld1q_u8(src);
+ src += src_stride;
+
+ t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
+ dst += dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ s0 = vld1q_u8(s + 0 * src_stride);
+ s1 = vld1q_u8(s + 1 * src_stride);
+ s2 = vld1q_u8(s + 2 * src_stride);
+ s3 = vld1q_u8(s + 3 * src_stride);
+
+ d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
+ d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
+ d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
+ d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
+
+ dd0 = vld1_u8(d + 0 * dst_stride);
+ dd1 = vld1_u8(d + 1 * dst_stride);
+ dd2 = vld1_u8(d + 2 * dst_stride);
+ dd3 = vld1_u8(d + 3 * dst_stride);
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ vst1_u8(d + 0 * dst_stride, d0);
+ vst1_u8(d + 1 * dst_stride, d1);
+ vst1_u8(d + 2 * dst_stride, d2);
+ vst1_u8(d + 3 * dst_stride, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
#else
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
-#endif
-
void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4,
}
}
+#endif
+
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)