From c23970ec254a3328c4fcf8387b4444a724a627af Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 22 Dec 2016 10:04:42 -0800 Subject: [PATCH] postproc: vpx_mbpost_proc_down_neon This was much more amenable to optimization than the across filter. Speedup of almost 2.5x BUG=webm:1320 Change-Id: I49acc0f9cb2e7642303df90132cbc938acade4c4 --- test/pp_filter_test.cc | 3 ++ vpx_dsp/arm/deblock_neon.c | 99 ++++++++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 3 files changed, 103 insertions(+), 1 deletion(-) diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 4b4795acc..7061c9159 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -598,6 +598,9 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcAcrossIpTest, ::testing::Values(vpx_mbpost_proc_across_ip_neon)); + +INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_neon)); #endif // HAVE_NEON #if HAVE_MSA diff --git a/vpx_dsp/arm/deblock_neon.c b/vpx_dsp/arm/deblock_neon.c index ed1a4df25..1fb41d299 100644 --- a/vpx_dsp/arm/deblock_neon.c +++ b/vpx_dsp/arm/deblock_neon.c @@ -15,6 +15,8 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/transpose_neon.h" +extern const int16_t vpx_rv[]; + static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1, const uint8x8_t v0, const uint8x8_t b1, const uint8x8_t b2) { @@ -384,3 +386,100 @@ void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols, src += pitch; } } + +// Apply filter of (vpx_rv + sum + s[c]) >> 4. +static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s, + const int16x8_t rv) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + const int16x8_t rounded = vaddq_s16(sum_s, rv); + + return vqshrun_n_s16(rounded, 4); +} + +void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int row, col, i; + const int32x4_t f = vdupq_n_s32(flimit); + uint8x8_t below_context = vdup_n_u8(0); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + // Load and keep the first 8 values in memory. Process a vertical stripe that + // is 8 wide. + for (col = 0; col < cols; col += 8) { + uint8x8_t s, above_context[8]; + int16x8_t sum, sum_tmp; + int32x4_t sumsq_low, sumsq_high; + + // Load and extend the top border. + s = vld1_u8(dst); + for (i = 0; i < 8; i++) { + above_context[i] = s; + } + + sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s)); + + // sum * 9 + sum = vmulq_n_s16(sum_tmp, 9); + + // (sum * 9) * sum == sum * sum * 9 + sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp)); + sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp)); + + // Load and discard the next 6 values to prime sum and sumsq. + for (i = 1; i <= 6; ++i) { + const uint8x8_t a = vld1_u8(dst + i * pitch); + const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a)); + sum = vaddq_s16(sum, b); + + sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b)); + sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b)); + } + + for (row = 0; row < rows; ++row) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(dst + row * pitch); + + // Extend the bottom border. + if (row + 7 < rows) { + below_context = vld1_u8(dst + (row + 7) * pitch); + } + + x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0])); + y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0])); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + sum = vaddq_s16(sum, x); + + sumsq_low = vaddq_s32(sumsq_low, xy_low); + sumsq_high = vaddq_s32(sumsq_high, xy_high); + + mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low, + sumsq_high, f); + + output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127))); + output = vbsl_u8(mask, output, s); + + vst1_u8(dst + row * pitch, output); + + above_context[0] = above_context[1]; + above_context[1] = above_context[2]; + above_context[2] = above_context[3]; + above_context[3] = above_context[4]; + above_context[4] = above_context[5]; + above_context[5] = above_context[6]; + above_context[6] = above_context[7]; + above_context[7] = s; + } + + dst += 8; + } +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ed7dd4da5..ee1b29279 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1751,7 +1751,7 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") specialize qw/vpx_plane_add_noise sse2 msa/; add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vpx_mbpost_proc_down sse2 msa/; + specialize qw/vpx_mbpost_proc_down sse2 neon msa/; add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/; -- 2.40.0