From: Johann Date: Fri, 12 May 2017 18:05:03 +0000 (-0700) Subject: neon fdct: 4x4 implementation X-Git-Tag: v1.7.0~451^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=105503b8392f0d42cfc789174e67c59e80563a83;p=libvpx neon fdct: 4x4 implementation Approximately twice as fast as C implementation. BUG=webm:1424 Change-Id: I3c0307fb08ddc23df42545cd089a78e2ed5c9d3f --- diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index bd2327520..aa90bfa18 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -440,7 +440,7 @@ INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT, #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, + ::testing::Values(make_tuple(&vpx_fdct4x4_neon, &vpx_idct4x4_16_add_neon, 0, VPX_BITS_8))); #if !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct_neon.c new file mode 100644 index 000000000..fe78f3f51 --- /dev/null +++ b/vpx_dsp/arm/fdct_neon.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + int i; + // input[M * stride] * 16 + int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + + // If the very first value != 0, then add 1. + if (input[0] != 0) { + const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); + input_0 = vadd_s16(input_0, one); + } + + for (i = 0; i < 2; ++i) { + const int16x8_t input_01 = vcombine_s16(input_0, input_1); + const int16x8_t input_32 = vcombine_s16(input_3, input_2); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // (s_0 +/- s_1) * cospi_16_64 + // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. + const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); + const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); + const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int16_t)cospi_16_64); + const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int16_t)cospi_16_64); + + // fdct_round_shift + int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); + int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int16_t)cospi_8_64); + const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int16_t)cospi_24_64); + + const int32x4_t temp3 = + vmlal_n_s16(s_3_cospi_8_64, s_2, (int16_t)cospi_24_64); + const int32x4_t temp4 = + vmlsl_n_s16(s_3_cospi_24_64, s_2, (int16_t)cospi_8_64); + + // fdct_round_shift + int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); + int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); + + transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); + + input_0 = out_0; + input_1 = out_1; + input_2 = out_2; + input_3 = out_3; + } + + { + // Not quite a rounding shift. Only add 1 despite shifting by 2. + const int16x8_t one = vdupq_n_s16(1); + int16x8_t out_01 = vcombine_s16(input_0, input_1); + int16x8_t out_23 = vcombine_s16(input_2, input_3); + out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); + out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); + store_s16q_to_tran_low(final_output + 0 * 8, out_01); + store_s16q_to_tran_low(final_output + 1 * 8, out_23); + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index afa14accc..6ac7182ab 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -193,6 +193,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h +DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 5acbf2d68..410055077 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -484,7 +484,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct4x4 sse2/; + specialize qw/vpx_fdct4x4 neon sse2/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2/; @@ -532,7 +532,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; } else { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct4x4 sse2 msa/; + specialize qw/vpx_fdct4x4 neon sse2 msa/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2/;