postproc: vpx_mbpost_proc_down_neon

author Johann <johannkoenig@google.com>

Thu, 22 Dec 2016 18:04:42 +0000 (10:04 -0800)

committer Johann <johannkoenig@google.com>

Mon, 9 Jan 2017 18:21:56 +0000 (10:21 -0800)
author Johann <johannkoenig@google.com>
Thu, 22 Dec 2016 18:04:42 +0000 (10:04 -0800)
committer Johann <johannkoenig@google.com>
Mon, 9 Jan 2017 18:21:56 +0000 (10:21 -0800)
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc

index 4b4795accf0c637cc3e4852a1000f9ac19bd2920..7061c91597ce55705a44eb16f0288196a90939e0 100644 (file)
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -598,6 +598,9 @@ INSTANTIATE_TEST_CASE_P(
  
  INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcAcrossIpTest,
                          ::testing::Values(vpx_mbpost_proc_across_ip_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcDownTest,
+                        ::testing::Values(vpx_mbpost_proc_down_neon));
  #endif  // HAVE_NEON
  
  #if HAVE_MSA
diff --git a/vpx_dsp/arm/deblock_neon.c b/vpx_dsp/arm/deblock_neon.c

index ed1a4df25c1d2d315fcf2df227d61831032f5925..1fb41d29920872f13157e718cd9a66e8f8d3e30a 100644 (file)
--- a/vpx_dsp/arm/deblock_neon.c
+++ b/vpx_dsp/arm/deblock_neon.c
@@ -15,6 +15,8 @@
  #include "vpx/vpx_integer.h"
  #include "vpx_dsp/arm/transpose_neon.h"
  
+extern const int16_t vpx_rv[];
+
  static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1,
                                 const uint8x8_t v0, const uint8x8_t b1,
                                 const uint8x8_t b2) {
@@ -384,3 +386,100 @@ void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols,
      src += pitch;
    }
  }
+
+// Apply filter of (vpx_rv + sum + s[c]) >> 4.
+static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s,
+                                  const int16x8_t rv) {
+  const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
+  const int16x8_t sum_s = vaddq_s16(sum, s16);
+  const int16x8_t rounded = vaddq_s16(sum_s, rv);
+
+  return vqshrun_n_s16(rounded, 4);
+}
+
+void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols,
+                               int flimit) {
+  int row, col, i;
+  const int32x4_t f = vdupq_n_s32(flimit);
+  uint8x8_t below_context = vdup_n_u8(0);
+
+  // 8 columns are processed at a time.
+  // If rows is less than 8 the bottom border extension fails.
+  assert(cols % 8 == 0);
+  assert(rows >= 8);
+
+  // Load and keep the first 8 values in memory. Process a vertical stripe that
+  // is 8 wide.
+  for (col = 0; col < cols; col += 8) {
+    uint8x8_t s, above_context[8];
+    int16x8_t sum, sum_tmp;
+    int32x4_t sumsq_low, sumsq_high;
+
+    // Load and extend the top border.
+    s = vld1_u8(dst);
+    for (i = 0; i < 8; i++) {
+      above_context[i] = s;
+    }
+
+    sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s));
+
+    // sum * 9
+    sum = vmulq_n_s16(sum_tmp, 9);
+
+    // (sum * 9) * sum == sum * sum * 9
+    sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp));
+    sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp));
+
+    // Load and discard the next 6 values to prime sum and sumsq.
+    for (i = 1; i <= 6; ++i) {
+      const uint8x8_t a = vld1_u8(dst + i * pitch);
+      const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a));
+      sum = vaddq_s16(sum, b);
+
+      sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b));
+      sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b));
+    }
+
+    for (row = 0; row < rows; ++row) {
+      uint8x8_t mask, output;
+      int16x8_t x, y;
+      int32x4_t xy_low, xy_high;
+
+      s = vld1_u8(dst + row * pitch);
+
+      // Extend the bottom border.
+      if (row + 7 < rows) {
+        below_context = vld1_u8(dst + (row + 7) * pitch);
+      }
+
+      x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0]));
+      y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0]));
+      xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
+      xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
+
+      sum = vaddq_s16(sum, x);
+
+      sumsq_low = vaddq_s32(sumsq_low, xy_low);
+      sumsq_high = vaddq_s32(sumsq_high, xy_high);
+
+      mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low,
+                          sumsq_high, f);
+
+      output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127)));
+      output = vbsl_u8(mask, output, s);
+
+      vst1_u8(dst + row * pitch, output);
+
+      above_context[0] = above_context[1];
+      above_context[1] = above_context[2];
+      above_context[2] = above_context[3];
+      above_context[3] = above_context[4];
+      above_context[4] = above_context[5];
+      above_context[5] = above_context[6];
+      above_context[6] = above_context[7];
+      above_context[7] = s;
+    }
+
+    dst += 8;
+  }
+}
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index ed7dd4da55c8e9d923a00a6abdf0c4a3491f9509..ee1b292793815e61d8d88139a9c82fda2211f6d6 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1751,7 +1751,7 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
      specialize qw/vpx_plane_add_noise sse2 msa/;
  
      add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
-    specialize qw/vpx_mbpost_proc_down sse2 msa/;
+    specialize qw/vpx_mbpost_proc_down sse2 neon msa/;
  
      add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
      specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/;
author	Johann <johannkoenig@google.com>
	Thu, 22 Dec 2016 18:04:42 +0000 (10:04 -0800)
committer	Johann <johannkoenig@google.com>
	Mon, 9 Jan 2017 18:21:56 +0000 (10:21 -0800)
test/pp_filter_test.cc		patch \| blob \| history
vpx_dsp/arm/deblock_neon.c		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history