From 7aa27bd62fd69ed3c530b1b1a61e574f9ec7aa32 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Wed, 5 Oct 2016 10:15:07 -0700 Subject: [PATCH] [vpx highbd lpf NEON 1/6] horizontal 4 BUG=webm:1300 Change-Id: Idf441806e6bf397ff5ecd8776146b3f781f50c40 --- test/lpf_test.cc | 17 ++- vp8/common/loopfilter_filters.c | 4 +- vpx_dsp/arm/highbd_loopfilter_neon.c | 180 +++++++++++++++++++++++++++ vpx_dsp/arm/loopfilter_neon.c | 4 +- vpx_dsp/loopfilter.c | 8 +- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- 7 files changed, 207 insertions(+), 11 deletions(-) create mode 100644 vpx_dsp/arm/highbd_loopfilter_neon.c diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 3f0a60b5c..376433535 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -514,7 +514,22 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_NEON #if CONFIG_VP9_HIGHBITDEPTH -// No neon high bitdepth functions. +INSTANTIATE_TEST_CASE_P( + NEON, Loop8Test6Param, + ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 12))); +INSTANTIATE_TEST_CASE_P( + NEON, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 12))); #else INSTANTIATE_TEST_CASE_P( NEON, Loop8Test6Param, diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c index 1f60721e1..2a7cde878 100644 --- a/vp8/common/loopfilter_filters.c +++ b/vp8/common/loopfilter_filters.c @@ -63,8 +63,8 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0, filter_value &= mask; /* save bottom 3 bits so that we round one side +4 and the other +3 - * if it equals 4 we'll set to adjust by -1 to account for the fact - * we'd round 3 the other way + * if it equals 4 we'll set it to adjust by -1 to account for the fact + * we'd round it by 3 the other way */ Filter1 = vp8_signed_char_clamp(filter_value + 4); Filter2 = vp8_signed_char_clamp(filter_value + 3); diff --git a/vpx_dsp/arm/highbd_loopfilter_neon.c b/vpx_dsp/arm/highbd_loopfilter_neon.c new file mode 100644 index 000000000..7ef326d34 --- /dev/null +++ b/vpx_dsp/arm/highbd_loopfilter_neon.c @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, uint16x8_t *blimit_vec, + uint16x8_t *limit_vec, uint16x8_t *thresh_vec, + const int bd) { + const int16x8_t shift = vdupq_n_s16(bd - 8); + *blimit_vec = vmovl_u8(vld1_dup_u8(blimit)); + *limit_vec = vmovl_u8(vld1_dup_u8(limit)); + *thresh_vec = vmovl_u8(vld1_dup_u8(thresh)); + *blimit_vec = vshlq_u16(*blimit_vec, shift); + *limit_vec = vshlq_u16(*limit_vec, shift); + *thresh_vec = vshlq_u16(*thresh_vec, shift); +} + +static INLINE uint16x8_t +filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit, + const uint16x8_t thresh, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) { + uint16x8_t max, t0, t1; + + max = vabdq_u16(p1, p0); + max = vmaxq_u16(max, vabdq_u16(q1, q0)); + *hev = vcgtq_u16(max, thresh); + *mask = vmaxq_u16(max, vabdq_u16(p3, p2)); + *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1)); + *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1)); + *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2)); + t0 = vabdq_u16(p0, q0); + t1 = vabdq_u16(p1, q1); + t0 = vaddq_u16(t0, t0); + t1 = vshrq_n_u16(t1, 1); + t0 = vaddq_u16(t0, t1); + *mask = vcleq_u16(*mask, limit); + t0 = vcleq_u16(t0, blimit); + *mask = vandq_u16(*mask, t0); + + return max; +} + +static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) { + const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8)); + return vreinterpretq_s16_u16(vsubq_u16(v, offset)); +} + +static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) { + const int16x8_t offset = vdupq_n_s16(0x80 << (bd - 8)); + return vreinterpretq_u16_s16(vaddq_s16(v, offset)); +} + +static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1); + const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1))); + int16x8_t filter, filter1, filter2, t; + int16x8_t ps1 = flip_sign(p1, bd); + int16x8_t ps0 = flip_sign(p0, bd); + int16x8_t qs0 = flip_sign(q0, bd); + int16x8_t qs1 = flip_sign(q1, bd); + + /* add outer taps if we have high edge variance */ + filter = vsubq_s16(ps1, qs1); + filter = vmaxq_s16(filter, min); + filter = vminq_s16(filter, max); + filter = vandq_s16(filter, vreinterpretq_s16_u16(hev)); + t = vsubq_s16(qs0, ps0); + + /* inner taps */ + filter = vaddq_s16(filter, t); + filter = vaddq_s16(filter, t); + filter = vaddq_s16(filter, t); + filter = vmaxq_s16(filter, min); + filter = vminq_s16(filter, max); + filter = vandq_s16(filter, vreinterpretq_s16_u16(mask)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ + /* we'd round it by 3 the other way */ + t = vaddq_s16(filter, vdupq_n_s16(4)); + t = vminq_s16(t, max); + filter1 = vshrq_n_s16(t, 3); + t = vaddq_s16(filter, vdupq_n_s16(3)); + t = vminq_s16(t, max); + filter2 = vshrq_n_s16(t, 3); + + qs0 = vsubq_s16(qs0, filter1); + qs0 = vmaxq_s16(qs0, min); + qs0 = vminq_s16(qs0, max); + ps0 = vaddq_s16(ps0, filter2); + ps0 = vmaxq_s16(ps0, min); + ps0 = vminq_s16(ps0, max); + *oq0 = flip_sign_back(qs0, bd); + *op0 = flip_sign_back(ps0, bd); + + /* outer tap adjustments */ + filter = vrshrq_n_s16(filter1, 1); + filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev)); + + qs1 = vsubq_s16(qs1, filter); + qs1 = vmaxq_s16(qs1, min); + qs1 = vminq_s16(qs1, max); + ps1 = vaddq_s16(ps1, filter); + ps1 = vmaxq_s16(ps1, min); + ps1 = vminq_s16(ps1, max); + *oq1 = flip_sign_back(qs1, bd); + *op1 = flip_sign_back(ps1, bd); +} + +static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3, + uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0, + uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2, + uint16x8_t *q3) { + *p3 = vld1q_u16(s); + s += p; + *p2 = vld1q_u16(s); + s += p; + *p1 = vld1q_u16(s); + s += p; + *p0 = vld1q_u16(s); + s += p; + *q0 = vld1q_u16(s); + s += p; + *q1 = vld1q_u16(s); + s += p; + *q2 = vld1q_u16(s); + s += p; + *q3 = vld1q_u16(s); +} + +static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); +} + +void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd); + store_8x4(s - 2 * p, p, p1, p0, q0, q1); +} + +void vpx_highbd_lpf_horizontal_4_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd); +} diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c index fc080163b..b1f8f43b4 100644 --- a/vpx_dsp/arm/loopfilter_neon.c +++ b/vpx_dsp/arm/loopfilter_neon.c @@ -423,8 +423,8 @@ static INLINE void apply_15_tap_filter_16( filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask)); \ \ /* save bottom 3 bits so that we round one side +4 and the other +3 */ \ - /* if it equals 4 we'll set to adjust by -1 to account for the fact */ \ - /* we'd round 3 the other way */ \ + /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \ + /* we'd round it by 3 the other way */ \ filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3); \ filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3); \ \ diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index 60a15e23b..9866ea37d 100644 --- a/vpx_dsp/loopfilter.c +++ b/vpx_dsp/loopfilter.c @@ -94,8 +94,8 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; // save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way filter1 = signed_char_clamp(filter + 4) >> 3; filter2 = signed_char_clamp(filter + 3) >> 3; @@ -425,8 +425,8 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; // Save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way. + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way. filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 6cf0a3704..f0f5bf878 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -159,6 +159,7 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c endif # CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 113087c9e..42f7e4c2c 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -487,10 +487,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/; add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/vpx_highbd_lpf_horizontal_4 sse2/; + specialize qw/vpx_highbd_lpf_horizontal_4 sse2 neon/; add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2/; + specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2 neon/; } # CONFIG_VP9_HIGHBITDEPTH # -- 2.50.0