Implement vpx_convolve8_avg_horiz_neon using SDOT instruction

author Jonathan Wright <jonathan.wright@arm.com>

Mon, 17 May 2021 09:53:07 +0000 (10:53 +0100)

committer James Zern <jzern@google.com>

Tue, 18 May 2021 20:33:46 +0000 (13:33 -0700)
author Jonathan Wright <jonathan.wright@arm.com>
Mon, 17 May 2021 09:53:07 +0000 (10:53 +0100)
committer James Zern <jzern@google.com>
Tue, 18 May 2021 20:33:46 +0000 (13:33 -0700)
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c

index 73949681690c041abd425209c27813864d383b16..acb128c8556dffb131bfc751277c09326b9d59ba 100644 (file)
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -157,6 +157,117 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
    }
  }
  
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  uint8x16_t s0, s1, s2, s3;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int32x4_t t0, t1, t2, t3;
+      int16x8_t t01, t23;
+      uint8x8_t d01, d23, dd01, dd23;
+      dd01 = vdup_n_u8(0);
+      dd23 = vdup_n_u8(0);
+
+      s0 = vld1q_u8(src);
+      src += src_stride;
+      s1 = vld1q_u8(src);
+      src += src_stride;
+      s2 = vld1q_u8(src);
+      src += src_stride;
+      s3 = vld1q_u8(src);
+      src += src_stride;
+
+      t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
+      t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
+      t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
+      t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+
+      t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+      t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+      d01 = vqrshrun_n_s16(t01, 7);
+      d23 = vqrshrun_n_s16(t23, 7);
+
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
+      dst += dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        s0 = vld1q_u8(s + 0 * src_stride);
+        s1 = vld1q_u8(s + 1 * src_stride);
+        s2 = vld1q_u8(s + 2 * src_stride);
+        s3 = vld1q_u8(s + 3 * src_stride);
+
+        d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
+        d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
+        d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
+        d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
+
+        dd0 = vld1_u8(d + 0 * dst_stride);
+        dd1 = vld1_u8(d + 1 * dst_stride);
+        dd2 = vld1_u8(d + 2 * dst_stride);
+        dd3 = vld1_u8(d + 3 * dst_stride);
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        vst1_u8(d + 0 * dst_stride, d0);
+        vst1_u8(d + 1 * dst_stride, d1);
+        vst1_u8(d + 2 * dst_stride, d2);
+        vst1_u8(d + 3 * dst_stride, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
  #else
  
  void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
@@ -412,8 +523,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
    }
  }
  
-#endif
-
  void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
@@ -704,6 +813,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
    }
  }
  
+#endif
+
  #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
      (__ARM_FEATURE_DOTPROD == 1)
author	Jonathan Wright <jonathan.wright@arm.com>
	Mon, 17 May 2021 09:53:07 +0000 (10:53 +0100)
committer	James Zern <jzern@google.com>
	Tue, 18 May 2021 20:33:46 +0000 (13:33 -0700)