From f9efbad392f001c59a38733f61e53611348f7fc5 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Fri, 12 Aug 2016 18:14:21 -0700 Subject: [PATCH] NEON asm of vpx_lpf_{horizontal,vertical}_8_dual_neon() Also expose the NEON intrinsics version. BUG=webm:1261, webm:1266. Change-Id: I8c4ae658467dcf66ebf7a75982b2ef712dbb4535 --- test/lpf_8_test.cc | 24 +++++------ vpx_dsp/arm/loopfilter_8_neon.asm | 66 +++++++++++++++++++++++++++++++ vpx_dsp/arm/loopfilter_8_neon.c | 16 ++++++++ vpx_dsp/arm/loopfilter_neon.c | 18 --------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +-- 5 files changed, 94 insertions(+), 36 deletions(-) diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc index 27bedb8c7..552b5e33c 100644 --- a/test/lpf_8_test.cc +++ b/test/lpf_8_test.cc @@ -531,20 +531,16 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8))); -INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param, - ::testing::Values( -// Using #if inside the macro is unsupported on MSVS but the tests are not -// currently built for MSVS with ARM and NEON. -#if HAVE_NEON_ASM - make_tuple(&vpx_lpf_horizontal_8_dual_neon, - &vpx_lpf_horizontal_8_dual_c, 8), - make_tuple(&vpx_lpf_vertical_8_dual_neon, - &vpx_lpf_vertical_8_dual_c, 8), -#endif // HAVE_NEON_ASM - make_tuple(&vpx_lpf_horizontal_4_dual_neon, - &vpx_lpf_horizontal_4_dual_c, 8), - make_tuple(&vpx_lpf_vertical_4_dual_neon, - &vpx_lpf_vertical_4_dual_c, 8))); +INSTANTIATE_TEST_CASE_P( + NEON, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_neon, + &vpx_lpf_vertical_8_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_4_dual_neon, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_neon, + &vpx_lpf_vertical_4_dual_c, 8))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON diff --git a/vpx_dsp/arm/loopfilter_8_neon.asm b/vpx_dsp/arm/loopfilter_8_neon.asm index a2f20e15f..a042d40ac 100644 --- a/vpx_dsp/arm/loopfilter_8_neon.asm +++ b/vpx_dsp/arm/loopfilter_8_neon.asm @@ -9,7 +9,9 @@ ; EXPORT |vpx_lpf_horizontal_8_neon| + EXPORT |vpx_lpf_horizontal_8_dual_neon| EXPORT |vpx_lpf_vertical_8_neon| + EXPORT |vpx_lpf_vertical_8_dual_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 @@ -64,6 +66,38 @@ ENDP ; |vpx_lpf_horizontal_8_neon| +;void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, +; int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp + 4 const uint8_t *blimit1, +; sp + 8 const uint8_t *limit1, +; sp + 12 const uint8_t *thresh1, +|vpx_lpf_horizontal_8_dual_neon| PROC + push {r0-r1, lr} + ldr lr, [sp, #12] + push {lr} ; thresh0 + bl vpx_lpf_horizontal_8_neon + + ldr r2, [sp, #20] ; blimit1 + ldr r3, [sp, #24] ; limit1 + ldr lr, [sp, #28] + str lr, [sp, #16] ; thresh1 + add sp, #4 + pop {r0-r1, lr} + add r0, #8 ; s + 8 + b vpx_lpf_horizontal_8_neon + ENDP ; |vpx_lpf_horizontal_8_dual_neon| + ; void vpx_lpf_vertical_8_neon(uint8_t *s, ; int pitch, ; const uint8_t *blimit, @@ -139,6 +173,38 @@ pop {r4-r5, pc} ENDP ; |vpx_lpf_vertical_8_neon| +;void vpx_lpf_vertical_8_dual_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int pitch +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp + 4 const uint8_t *blimit1, +; sp + 8 const uint8_t *limit1, +; sp + 12 const uint8_t *thresh1, +|vpx_lpf_vertical_8_dual_neon| PROC + push {r0-r1, lr} + ldr lr, [sp, #12] + push {lr} ; thresh0 + bl vpx_lpf_vertical_8_neon + + ldr r2, [sp, #20] ; blimit1 + ldr r3, [sp, #24] ; limit1 + ldr lr, [sp, #28] + str lr, [sp, #16] ; thresh1 + add sp, #4 + pop {r0-r1, lr} + add r0, r1, lsl #3 ; s + 8 * pitch + b vpx_lpf_vertical_8_neon + ENDP ; |vpx_lpf_vertical_8_dual_neon| + ; void vpx_mbloop_filter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the ; necessary load, transpose (if necessary) and store. The function does not use diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c index 854196f42..8641541b0 100644 --- a/vpx_dsp/arm/loopfilter_8_neon.c +++ b/vpx_dsp/arm/loopfilter_8_neon.c @@ -311,6 +311,14 @@ void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, return; } +void vpx_lpf_horizontal_8_dual_neon( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); +} + void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; @@ -427,3 +435,11 @@ void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, } return; } + +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); + vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); +} diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c index 7741b226c..ced5aef0a 100644 --- a/vpx_dsp/arm/loopfilter_neon.c +++ b/vpx_dsp/arm/loopfilter_neon.c @@ -21,21 +21,3 @@ void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); } - -#if HAVE_NEON_ASM -void vpx_lpf_horizontal_8_dual_neon( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); -} - -void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); -} -#endif // HAVE_NEON_ASM diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 428d5e951..c7bad5222 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -514,8 +514,7 @@ add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *bl specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; -$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon; +specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/; @@ -533,8 +532,7 @@ add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t * specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; -$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon; +specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/; -- 2.40.0