From 7aa27bd62fd69ed3c530b1b1a61e574f9ec7aa32 Mon Sep 17 00:00:00 2001
From: Linfeng Zhang <linfengz@google.com>
Date: Wed, 5 Oct 2016 10:15:07 -0700
Subject: [PATCH] [vpx highbd lpf NEON 1/6] horizontal 4

BUG=webm:1300

Change-Id: Idf441806e6bf397ff5ecd8776146b3f781f50c40
---
 test/lpf_test.cc                     |  17 ++-
 vp8/common/loopfilter_filters.c      |   4 +-
 vpx_dsp/arm/highbd_loopfilter_neon.c | 180 +++++++++++++++++++++++++++
 vpx_dsp/arm/loopfilter_neon.c        |   4 +-
 vpx_dsp/loopfilter.c                 |   8 +-
 vpx_dsp/vpx_dsp.mk                   |   1 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl         |   4 +-
 7 files changed, 207 insertions(+), 11 deletions(-)
 create mode 100644 vpx_dsp/arm/highbd_loopfilter_neon.c

diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 3f0a60b5c..376433535 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -514,7 +514,22 @@ INSTANTIATE_TEST_CASE_P(
 
 #if HAVE_NEON
 #if CONFIG_VP9_HIGHBITDEPTH
-// No neon high bitdepth functions.
+INSTANTIATE_TEST_CASE_P(
+    NEON, Loop8Test6Param,
+    ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_neon,
+                                 &vpx_highbd_lpf_horizontal_4_c, 8),
+                      make_tuple(&vpx_highbd_lpf_horizontal_4_neon,
+                                 &vpx_highbd_lpf_horizontal_4_c, 10),
+                      make_tuple(&vpx_highbd_lpf_horizontal_4_neon,
+                                 &vpx_highbd_lpf_horizontal_4_c, 12)));
+INSTANTIATE_TEST_CASE_P(
+    NEON, Loop8Test9Param,
+    ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon,
+                                 &vpx_highbd_lpf_horizontal_4_dual_c, 8),
+                      make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon,
+                                 &vpx_highbd_lpf_horizontal_4_dual_c, 10),
+                      make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon,
+                                 &vpx_highbd_lpf_horizontal_4_dual_c, 12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     NEON, Loop8Test6Param,
diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c
index 1f60721e1..2a7cde878 100644
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -63,8 +63,8 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0,
   filter_value &= mask;
 
   /* save bottom 3 bits so that we round one side +4 and the other +3
-   * if it equals 4 we'll set to adjust by -1 to account for the fact
-   * we'd round 3 the other way
+   * if it equals 4 we'll set it to adjust by -1 to account for the fact
+   * we'd round it by 3 the other way
    */
   Filter1 = vp8_signed_char_clamp(filter_value + 4);
   Filter2 = vp8_signed_char_clamp(filter_value + 3);
diff --git a/vpx_dsp/arm/highbd_loopfilter_neon.c b/vpx_dsp/arm/highbd_loopfilter_neon.c
new file mode 100644
index 000000000..7ef326d34
--- /dev/null
+++ b/vpx_dsp/arm/highbd_loopfilter_neon.c
@@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit,
+                               const uint8_t *thresh, uint16x8_t *blimit_vec,
+                               uint16x8_t *limit_vec, uint16x8_t *thresh_vec,
+                               const int bd) {
+  const int16x8_t shift = vdupq_n_s16(bd - 8);
+  *blimit_vec = vmovl_u8(vld1_dup_u8(blimit));
+  *limit_vec = vmovl_u8(vld1_dup_u8(limit));
+  *thresh_vec = vmovl_u8(vld1_dup_u8(thresh));
+  *blimit_vec = vshlq_u16(*blimit_vec, shift);
+  *limit_vec = vshlq_u16(*limit_vec, shift);
+  *thresh_vec = vshlq_u16(*thresh_vec, shift);
+}
+
+static INLINE uint16x8_t
+filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit,
+                 const uint16x8_t thresh, const uint16x8_t p3,
+                 const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
+                 const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
+                 const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) {
+  uint16x8_t max, t0, t1;
+
+  max = vabdq_u16(p1, p0);
+  max = vmaxq_u16(max, vabdq_u16(q1, q0));
+  *hev = vcgtq_u16(max, thresh);
+  *mask = vmaxq_u16(max, vabdq_u16(p3, p2));
+  *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1));
+  *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1));
+  *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2));
+  t0 = vabdq_u16(p0, q0);
+  t1 = vabdq_u16(p1, q1);
+  t0 = vaddq_u16(t0, t0);
+  t1 = vshrq_n_u16(t1, 1);
+  t0 = vaddq_u16(t0, t1);
+  *mask = vcleq_u16(*mask, limit);
+  t0 = vcleq_u16(t0, blimit);
+  *mask = vandq_u16(*mask, t0);
+
+  return max;
+}
+
+static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) {
+  const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8));
+  return vreinterpretq_s16_u16(vsubq_u16(v, offset));
+}
+
+static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) {
+  const int16x8_t offset = vdupq_n_s16(0x80 << (bd - 8));
+  return vreinterpretq_u16_s16(vaddq_s16(v, offset));
+}
+
+static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev,
+                           const uint16x8_t p1, const uint16x8_t p0,
+                           const uint16x8_t q0, const uint16x8_t q1,
+                           uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0,
+                           uint16x8_t *oq1, const int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1);
+  const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1)));
+  int16x8_t filter, filter1, filter2, t;
+  int16x8_t ps1 = flip_sign(p1, bd);
+  int16x8_t ps0 = flip_sign(p0, bd);
+  int16x8_t qs0 = flip_sign(q0, bd);
+  int16x8_t qs1 = flip_sign(q1, bd);
+
+  /* add outer taps if we have high edge variance */
+  filter = vsubq_s16(ps1, qs1);
+  filter = vmaxq_s16(filter, min);
+  filter = vminq_s16(filter, max);
+  filter = vandq_s16(filter, vreinterpretq_s16_u16(hev));
+  t = vsubq_s16(qs0, ps0);
+
+  /* inner taps */
+  filter = vaddq_s16(filter, t);
+  filter = vaddq_s16(filter, t);
+  filter = vaddq_s16(filter, t);
+  filter = vmaxq_s16(filter, min);
+  filter = vminq_s16(filter, max);
+  filter = vandq_s16(filter, vreinterpretq_s16_u16(mask));
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  /* if it equals 4 we'll set it to adjust by -1 to account for the fact */
+  /* we'd round it by 3 the other way */
+  t = vaddq_s16(filter, vdupq_n_s16(4));
+  t = vminq_s16(t, max);
+  filter1 = vshrq_n_s16(t, 3);
+  t = vaddq_s16(filter, vdupq_n_s16(3));
+  t = vminq_s16(t, max);
+  filter2 = vshrq_n_s16(t, 3);
+
+  qs0 = vsubq_s16(qs0, filter1);
+  qs0 = vmaxq_s16(qs0, min);
+  qs0 = vminq_s16(qs0, max);
+  ps0 = vaddq_s16(ps0, filter2);
+  ps0 = vmaxq_s16(ps0, min);
+  ps0 = vminq_s16(ps0, max);
+  *oq0 = flip_sign_back(qs0, bd);
+  *op0 = flip_sign_back(ps0, bd);
+
+  /* outer tap adjustments */
+  filter = vrshrq_n_s16(filter1, 1);
+  filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev));
+
+  qs1 = vsubq_s16(qs1, filter);
+  qs1 = vmaxq_s16(qs1, min);
+  qs1 = vminq_s16(qs1, max);
+  ps1 = vaddq_s16(ps1, filter);
+  ps1 = vmaxq_s16(ps1, min);
+  ps1 = vminq_s16(ps1, max);
+  *oq1 = flip_sign_back(qs1, bd);
+  *op1 = flip_sign_back(ps1, bd);
+}
+
+static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3,
+                            uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0,
+                            uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2,
+                            uint16x8_t *q3) {
+  *p3 = vld1q_u16(s);
+  s += p;
+  *p2 = vld1q_u16(s);
+  s += p;
+  *p1 = vld1q_u16(s);
+  s += p;
+  *p0 = vld1q_u16(s);
+  s += p;
+  *q0 = vld1q_u16(s);
+  s += p;
+  *q1 = vld1q_u16(s);
+  s += p;
+  *q2 = vld1q_u16(s);
+  s += p;
+  *q3 = vld1q_u16(s);
+}
+
+static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0,
+                             const uint16x8_t s1, const uint16x8_t s2,
+                             const uint16x8_t s3) {
+  vst1q_u16(s, s0);
+  s += p;
+  vst1q_u16(s, s1);
+  s += p;
+  vst1q_u16(s, s2);
+  s += p;
+  vst1q_u16(s, s3);
+}
+
+void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
+  uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      mask, hev;
+
+  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+  load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+                   q2, q3, &hev, &mask);
+  filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
+  store_8x4(s - 2 * p, p, p1, p0, q0, q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_neon(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd);
+}
diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
index fc080163b..b1f8f43b4 100644
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -423,8 +423,8 @@ static INLINE void apply_15_tap_filter_16(
     filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask));               \
                                                                               \
     /* save bottom 3 bits so that we round one side +4 and the other +3 */    \
-    /* if it equals 4 we'll set to adjust by -1 to account for the fact */    \
-    /* we'd round 3 the other way */                                          \
+    /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \
+    /* we'd round it by 3 the other way */                                    \
     filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3);       \
     filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3);       \
                                                                               \
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 60a15e23b..9866ea37d 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -94,8 +94,8 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
   filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
 
   // save bottom 3 bits so that we round one side +4 and the other +3
-  // if it equals 4 we'll set to adjust by -1 to account for the fact
-  // we'd round 3 the other way
+  // if it equals 4 we'll set it to adjust by -1 to account for the fact
+  // we'd round it by 3 the other way
   filter1 = signed_char_clamp(filter + 4) >> 3;
   filter2 = signed_char_clamp(filter + 3) >> 3;
 
@@ -425,8 +425,8 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
   filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
 
   // Save bottom 3 bits so that we round one side +4 and the other +3
-  // if it equals 4 we'll set to adjust by -1 to account for the fact
-  // we'd round 3 the other way.
+  // if it equals 4 we'll set it to adjust by -1 to account for the fact
+  // we'd round it by 3 the other way.
   filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
   filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
 
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 6cf0a3704..f0f5bf878 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -159,6 +159,7 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_loopfilter_neon.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 113087c9e..42f7e4c2c 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -487,10 +487,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/;
 
   add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/vpx_highbd_lpf_horizontal_4 sse2/;
+  specialize qw/vpx_highbd_lpf_horizontal_4 sse2 neon/;
 
   add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2/;
+  specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2 neon/;
 }  # CONFIG_VP9_HIGHBITDEPTH
 
 #
-- 
2.50.0