From 1eb8a718bfe08dd6e6ffdfff2e4613c4b0d4d7d6 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 30 Jan 2017 16:16:27 -0800 Subject: [PATCH] hadamard highbd neon: use tran_low_t for coeff BUG=webm:1365 Change-Id: I7e15192ead3a3631755b386f102c979f06e26279 --- test/hadamard_test.cc | 4 +++- vpx_dsp/arm/hadamard_neon.c | 38 +++++++++++++++++++----------------- vpx_dsp/arm/idct_neon.h | 11 +++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 ++-- 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index bc09e952d..3b19b23be 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -157,12 +157,14 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test, INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test, ::testing::Values(&vpx_hadamard_8x8_ssse3)); #endif // HAVE_SSSE3 && ARCH_X86_64 +#endif // !CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test, ::testing::Values(&vpx_hadamard_8x8_neon)); #endif // HAVE_NEON +#if !CONFIG_VP9_HIGHBITDEPTH #if HAVE_MSA INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test, ::testing::Values(&vpx_hadamard_8x8_msa)); @@ -220,12 +222,12 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_sse2)); #endif // HAVE_SSE2 -#if !CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_neon)); #endif // HAVE_NEON +#if !CONFIG_VP9_HIGHBITDEPTH #if HAVE_MSA INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_msa)); diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c index 977323497..ebeafed31 100644 --- a/vpx_dsp/arm/hadamard_neon.c +++ b/vpx_dsp/arm/hadamard_neon.c @@ -11,6 +11,8 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, @@ -45,7 +47,7 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, } void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { + tran_low_t *coeff) { int16x8_t a0 = vld1q_s16(src_diff); int16x8_t a1 = vld1q_s16(src_diff + src_stride); int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); @@ -63,18 +65,18 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, // Skip the second transpose because it is not required. - vst1q_s16(coeff + 0, a0); - vst1q_s16(coeff + 8, a1); - vst1q_s16(coeff + 16, a2); - vst1q_s16(coeff + 24, a3); - vst1q_s16(coeff + 32, a4); - vst1q_s16(coeff + 40, a5); - vst1q_s16(coeff + 48, a6); - vst1q_s16(coeff + 56, a7); + store_s16q_to_tran_low(coeff + 0, a0); + store_s16q_to_tran_low(coeff + 8, a1); + store_s16q_to_tran_low(coeff + 16, a2); + store_s16q_to_tran_low(coeff + 24, a3); + store_s16q_to_tran_low(coeff + 32, a4); + store_s16q_to_tran_low(coeff + 40, a5); + store_s16q_to_tran_low(coeff + 48, a6); + store_s16q_to_tran_low(coeff + 56, a7); } void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { + tran_low_t *coeff) { int i; /* Rearrange 16x16 to 8x32 and remove stride. @@ -88,10 +90,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); for (i = 0; i < 64; i += 8) { - const int16x8_t a0 = vld1q_s16(coeff + 0); - const int16x8_t a1 = vld1q_s16(coeff + 64); - const int16x8_t a2 = vld1q_s16(coeff + 128); - const int16x8_t a3 = vld1q_s16(coeff + 192); + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192); const int16x8_t b0 = vhaddq_s16(a0, a1); const int16x8_t b1 = vhsubq_s16(a0, a1); @@ -103,10 +105,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, const int16x8_t c2 = vsubq_s16(b0, b2); const int16x8_t c3 = vsubq_s16(b1, b3); - vst1q_s16(coeff + 0, c0); - vst1q_s16(coeff + 64, c1); - vst1q_s16(coeff + 128, c2); - vst1q_s16(coeff + 192, c3); + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 64, c1); + store_s16q_to_tran_low(coeff + 128, c2); + store_s16q_to_tran_low(coeff + 192, c3); coeff += 8; } diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index d9b85223c..2f30a5add 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -76,6 +76,17 @@ static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { #endif } +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +#else + vst1q_s16(buf, a); +#endif +} + //------------------------------------------------------------------------------ // Multiply a by a_const. Saturate, shift and narrow by 14. diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 133f2fbcd..cf85cc89b 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -888,10 +888,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_8x8 sse2/; + specialize qw/vpx_hadamard_8x8 sse2 neon/; add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_16x16 sse2/; + specialize qw/vpx_hadamard_16x16 sse2 neon/; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd/; -- 2.40.0