From: Johann
Date: Fri, 23 Jun 2017 01:12:28 +0000 (-0700)
Subject: partial fdct neon: add 4x4_1
X-Git-Tag: v1.7.0~352^2~2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4959dd3eb36308aa9d94ba4feb6305e8b3f9148b;p=libvpx

partial fdct neon: add 4x4_1

BUG=webm:1424

Change-Id: Ib0f3cfd6116fc1f5a99acb8bfd76e25b90177ffc
---

diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc
index 11b2fadcc..f197349e5 100644
--- a/test/dct_partial_test.cc
+++ b/test/dct_partial_test.cc
@@ -144,11 +144,13 @@ INSTANTIATE_TEST_CASE_P(
     NEON, PartialFdctTest,
     ::testing::Values(make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12),
                       make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10),
-                      make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8)));
+                      make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
 #else
-INSTANTIATE_TEST_CASE_P(NEON, PartialFdctTest,
-                        ::testing::Values(make_tuple(&vpx_fdct8x8_1_neon, 8,
-                                                     VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    NEON, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c
index 3db40a11b..945b96a21 100644
--- a/vpx_dsp/arm/fdct_partial_neon.c
+++ b/vpx_dsp/arm/fdct_partial_neon.c
@@ -12,6 +12,41 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+static INLINE tran_low_t sum_int16x8(const int16x8_t a) {
+  const int32x4_t b = vpaddlq_s16(a);
+  const int64x2_t c = vpaddlq_s32(b);
+  const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+                               vreinterpret_s32_s64(vget_high_s64(c)));
+#if CONFIG_VP9_HIGHBITDEPTH
+  return vget_lane_s32(d, 0);
+#else
+  return vget_lane_s16(vreinterpret_s16_s32(d), 0);
+#endif
+}
+
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+  int16x4_t a0, a1, a2, a3;
+  int16x8_t b0, b1;
+  int16x8_t c;
+
+  a0 = vld1_s16(input);
+  input += stride;
+  a1 = vld1_s16(input);
+  input += stride;
+  a2 = vld1_s16(input);
+  input += stride;
+  a3 = vld1_s16(input);
+
+  b0 = vcombine_s16(a0, a1);
+  b1 = vcombine_s16(a2, a3);
+
+  c = vaddq_s16(b0, b1);
+
+  output[0] = sum_int16x8(c) << 1;
+  output[1] = 0;
+}
 
 void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
   int r;
@@ -20,16 +55,7 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
     const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
     sum = vaddq_s16(sum, input_00);
   }
-  {
-    const int32x4_t a = vpaddlq_s16(sum);
-    const int64x2_t b = vpaddlq_s32(a);
-    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                                 vreinterpret_s32_s64(vget_high_s64(b)));
-#if CONFIG_VP9_HIGHBITDEPTH
-    output[0] = vget_lane_s32(c, 0);
-#else
-    output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
-#endif
-    output[1] = 0;
-  }
+
+  output[0] = sum_int16x8(sum);
+  output[1] = 0;
 }
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index b6e64efac..0453a6a6e 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -487,7 +487,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_fdct4x4 neon sse2/;
 
   add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4_1 sse2/;
+  specialize qw/vpx_fdct4x4_1 sse2 neon/;
 
   add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct8x8 neon sse2/;
@@ -537,7 +537,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_fdct4x4 neon sse2 msa/;
 
   add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4_1 sse2/;
+  specialize qw/vpx_fdct4x4_1 sse2 neon/;
 
   add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";