From: James Zern Date: Sat, 1 Oct 2016 18:59:31 +0000 (-0700) Subject: enable idct*_1_add_neon in high-bitdepth builds X-Git-Tag: v1.6.1~194^2~1 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a6be7ba1aaf67989bf97bae04707fe9bf5c7d4d4;p=libvpx enable idct*_1_add_neon in high-bitdepth builds These are compatible because they only load one element of the input, so the larger size of tran_low_t makes no difference in little-endian builds. Note that the asm is incompatible with big-endian, but there are other points of failure there, so big-endian is currently considered unsupported. BUG=webm:1294 Change-Id: Icd2665a0699bccae92d1bea43a95b0a83fb17028 --- diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 85946fbbf..9eb4d9dbb 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -201,7 +201,19 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, &vpx_idct4x4_1_add_c, TX_4X4, 1))); -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, PartialIDctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, + &vpx_idct32x32_1_add_neon, TX_32X32, 1), + make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, + &vpx_idct16x16_1_add_neon, TX_16X16, 1), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, + &vpx_idct8x8_1_add_neon, TX_8X8, 1), + make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, + &vpx_idct4x4_1_add_neon, TX_4X4, 1))); +#else // !CONFIG_VP9_HIGHBITDEPTH // 32x32_34_ 32x32_135_ are implemented using the 1024 version. 
INSTANTIATE_TEST_CASE_P( NEON, PartialIDctTest, @@ -229,7 +241,8 @@ INSTANTIATE_TEST_CASE_P( &vpx_idct4x4_16_add_neon, TX_4X4, 16), make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, &vpx_idct4x4_1_add_neon, TX_4X4, 1))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE // 32x32_135_ is implemented using the 1024 version. diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 66062b6e7..6cf0a3704 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -199,23 +199,15 @@ DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) -DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM) DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM) else ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/idct4x4_1_add_neon.c DSP_SRCS-yes += arm/idct4x4_add_neon.c -DSP_SRCS-yes += arm/idct8x8_1_add_neon.c DSP_SRCS-yes += arm/idct8x8_add_neon.c -DSP_SRCS-yes += arm/idct16x16_1_add_neon.c DSP_SRCS-yes += arm/idct16x16_add_neon.c -DSP_SRCS-yes += arm/idct32x32_1_add_neon.c DSP_SRCS-yes += arm/idct32x32_add_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM @@ -233,7 +225,20 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c -endif # CONFIG_VP9_HIGHBITDEPTH +endif # !CONFIG_VP9_HIGHBITDEPTH + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) +DSP_SRCS-yes += 
arm/idct16x16_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM) +else +DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c +endif # HAVE_NEON_ASM + endif # CONFIG_VP9 # quantization diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d148642e3..113087c9e 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -647,7 +647,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct4x4_16_add sse2/; add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct4x4_1_add sse2/; + specialize qw/vpx_idct4x4_1_add neon sse2/; add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64"; @@ -656,7 +656,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64"; add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct8x8_1_add sse2/; + specialize qw/vpx_idct8x8_1_add neon sse2/; add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct16x16_256_add sse2/; @@ -665,7 +665,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct16x16_10_add sse2/; add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct16x16_1_add sse2/; + specialize qw/vpx_idct16x16_1_add neon sse2/; add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64"; @@ -679,7 +679,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize 
qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64"; add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_1_add sse2/; + specialize qw/vpx_idct32x32_1_add neon sse2/; add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/vpx_highbd_idct4x4_16_add sse2/;