From f9efbad392f001c59a38733f61e53611348f7fc5 Mon Sep 17 00:00:00 2001
From: Linfeng Zhang <linfengz@google.com>
Date: Fri, 12 Aug 2016 18:14:21 -0700
Subject: [PATCH] NEON asm of vpx_lpf_{horizontal,vertical}_8_dual_neon()

Also expose the NEON intrinsics version.

BUG=webm:1261, webm:1266.

Change-Id: I8c4ae658467dcf66ebf7a75982b2ef712dbb4535
---
 test/lpf_8_test.cc                | 24 +++++------
 vpx_dsp/arm/loopfilter_8_neon.asm | 66 +++++++++++++++++++++++++++++++
 vpx_dsp/arm/loopfilter_8_neon.c   | 16 ++++++++
 vpx_dsp/arm/loopfilter_neon.c     | 18 ---------
 vpx_dsp/vpx_dsp_rtcd_defs.pl      |  6 +--
 5 files changed, 94 insertions(+), 36 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 27bedb8c7..552b5e33c 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -531,20 +531,16 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8),
         make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8),
         make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8)));
-INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param,
-                        ::testing::Values(
-// Using #if inside the macro is unsupported on MSVS but the tests are not
-// currently built for MSVS with ARM and NEON.
-#if HAVE_NEON_ASM
-                            make_tuple(&vpx_lpf_horizontal_8_dual_neon,
-                                       &vpx_lpf_horizontal_8_dual_c, 8),
-                            make_tuple(&vpx_lpf_vertical_8_dual_neon,
-                                       &vpx_lpf_vertical_8_dual_c, 8),
-#endif  // HAVE_NEON_ASM
-                            make_tuple(&vpx_lpf_horizontal_4_dual_neon,
-                                       &vpx_lpf_horizontal_4_dual_c, 8),
-                            make_tuple(&vpx_lpf_vertical_4_dual_neon,
-                                       &vpx_lpf_vertical_4_dual_c, 8)));
+INSTANTIATE_TEST_CASE_P(
+    NEON, Loop8Test9Param,
+    ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon,
+                                 &vpx_lpf_horizontal_8_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_8_dual_neon,
+                                 &vpx_lpf_vertical_8_dual_c, 8),
+                      make_tuple(&vpx_lpf_horizontal_4_dual_neon,
+                                 &vpx_lpf_horizontal_4_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_4_dual_neon,
+                                 &vpx_lpf_vertical_4_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
diff --git a/vpx_dsp/arm/loopfilter_8_neon.asm b/vpx_dsp/arm/loopfilter_8_neon.asm
index a2f20e15f..a042d40ac 100644
--- a/vpx_dsp/arm/loopfilter_8_neon.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -9,7 +9,9 @@
 ;
 
     EXPORT  |vpx_lpf_horizontal_8_neon|
+    EXPORT  |vpx_lpf_horizontal_8_dual_neon|
     EXPORT  |vpx_lpf_vertical_8_neon|
+    EXPORT  |vpx_lpf_vertical_8_dual_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
@@ -64,6 +66,38 @@
 
     ENDP        ; |vpx_lpf_horizontal_8_neon|
 
+;void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
+;                                    int p,
+;                                    const uint8_t *blimit0,
+;                                    const uint8_t *limit0,
+;                                    const uint8_t *thresh0,
+;                                    const uint8_t *blimit1,
+;                                    const uint8_t *limit1,
+;                                    const uint8_t *thresh1)
+; r0      uint8_t *s,
+; r1      int p, /* pitch */
+; r2      const uint8_t *blimit0,
+; r3      const uint8_t *limit0,
+; sp      const uint8_t *thresh0,
+; sp + 4  const uint8_t *blimit1,
+; sp + 8  const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_horizontal_8_dual_neon| PROC
+    push        {r0-r1, lr}
+    ldr         lr, [sp, #12]
+    push        {lr}                       ; thresh0
+    bl          vpx_lpf_horizontal_8_neon
+
+    ldr         r2, [sp, #20]              ; blimit1
+    ldr         r3, [sp, #24]              ; limit1
+    ldr         lr, [sp, #28]
+    str         lr, [sp, #16]              ; thresh1
+    add         sp, #4
+    pop         {r0-r1, lr}
+    add         r0, #8                     ; s + 8
+    b           vpx_lpf_horizontal_8_neon
+    ENDP        ; |vpx_lpf_horizontal_8_dual_neon|
+
 ; void vpx_lpf_vertical_8_neon(uint8_t *s,
 ;                              int pitch,
 ;                              const uint8_t *blimit,
@@ -139,6 +173,38 @@
     pop         {r4-r5, pc}
     ENDP        ; |vpx_lpf_vertical_8_neon|
 
+;void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
+;                                  int pitch,
+;                                  const uint8_t *blimit0,
+;                                  const uint8_t *limit0,
+;                                  const uint8_t *thresh0,
+;                                  const uint8_t *blimit1,
+;                                  const uint8_t *limit1,
+;                                  const uint8_t *thresh1)
+; r0      uint8_t *s,
+; r1      int pitch
+; r2      const uint8_t *blimit0,
+; r3      const uint8_t *limit0,
+; sp      const uint8_t *thresh0,
+; sp + 4  const uint8_t *blimit1,
+; sp + 8  const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_vertical_8_dual_neon| PROC
+    push        {r0-r1, lr}
+    ldr         lr, [sp, #12]
+    push        {lr}                       ; thresh0
+    bl          vpx_lpf_vertical_8_neon
+
+    ldr         r2, [sp, #20]              ; blimit1
+    ldr         r3, [sp, #24]              ; limit1
+    ldr         lr, [sp, #28]
+    str         lr, [sp, #16]              ; thresh1
+    add         sp, #4
+    pop         {r0-r1, lr}
+    add         r0, r1, lsl #3             ; s + 8 * pitch
+    b           vpx_lpf_vertical_8_neon
+    ENDP        ; |vpx_lpf_vertical_8_dual_neon|
+
 ; void vpx_mbloop_filter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c
index 854196f42..8641541b0 100644
--- a/vpx_dsp/arm/loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c
@@ -311,6 +311,14 @@ void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
   return;
 }
 
+void vpx_lpf_horizontal_8_dual_neon(
+    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
+}
+
 void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
   int i;
@@ -427,3 +435,11 @@ void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
   }
   return;
 }
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
index 7741b226c..ced5aef0a 100644
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -21,21 +21,3 @@ void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
   vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
   vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
 }
-
-#if HAVE_NEON_ASM
-void vpx_lpf_horizontal_8_dual_neon(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-#endif  // HAVE_NEON_ASM
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 428d5e951..c7bad5222 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -514,8 +514,7 @@ add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *bl
 specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
@@ -533,8 +532,7 @@ add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *
 specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;
-- 
2.40.0