granicus.if.org Git - libvpx/commitdiff
NEON asm of vpx_lpf_{horizontal,vertical}_8_dual_neon()
author Linfeng Zhang <linfengz@google.com>
Sat, 13 Aug 2016 01:14:21 +0000 (18:14 -0700)
committer Linfeng Zhang <linfengz@google.com>
Tue, 16 Aug 2016 15:50:57 +0000 (08:50 -0700)
Also expose the NEON intrinsics version.

BUG=webm:1261, webm:1266.

Change-Id: I8c4ae658467dcf66ebf7a75982b2ef712dbb4535

test/lpf_8_test.cc
vpx_dsp/arm/loopfilter_8_neon.asm
vpx_dsp/arm/loopfilter_8_neon.c
vpx_dsp/arm/loopfilter_neon.c
vpx_dsp/vpx_dsp_rtcd_defs.pl

index 27bedb8c714f22e1dd51fd522697a9ce1cc7bf8f..552b5e33c297daabdb225307eeb4e2853d2caf71 100644 (file)
@@ -531,20 +531,16 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8),
         make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8),
         make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8)));
-INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param,
-                        ::testing::Values(
-// Using #if inside the macro is unsupported on MSVS but the tests are not
-// currently built for MSVS with ARM and NEON.
-#if HAVE_NEON_ASM
-                            make_tuple(&vpx_lpf_horizontal_8_dual_neon,
-                                       &vpx_lpf_horizontal_8_dual_c, 8),
-                            make_tuple(&vpx_lpf_vertical_8_dual_neon,
-                                       &vpx_lpf_vertical_8_dual_c, 8),
-#endif  // HAVE_NEON_ASM
-                            make_tuple(&vpx_lpf_horizontal_4_dual_neon,
-                                       &vpx_lpf_horizontal_4_dual_c, 8),
-                            make_tuple(&vpx_lpf_vertical_4_dual_neon,
-                                       &vpx_lpf_vertical_4_dual_c, 8)));
+INSTANTIATE_TEST_CASE_P(
+    NEON, Loop8Test9Param,
+    ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon,
+                                 &vpx_lpf_horizontal_8_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_8_dual_neon,
+                                 &vpx_lpf_vertical_8_dual_c, 8),
+                      make_tuple(&vpx_lpf_horizontal_4_dual_neon,
+                                 &vpx_lpf_horizontal_4_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_4_dual_neon,
+                                 &vpx_lpf_vertical_4_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
index a2f20e15f81f711c4e877303e136f98c0b41c157..a042d40acb2cf3bab77ed6a34d0fab01979541fc 100644 (file)
@@ -9,7 +9,9 @@
 ;
 
     EXPORT  |vpx_lpf_horizontal_8_neon|
+    EXPORT  |vpx_lpf_horizontal_8_dual_neon|
     EXPORT  |vpx_lpf_vertical_8_neon|
+    EXPORT  |vpx_lpf_vertical_8_dual_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
     ENDP        ; |vpx_lpf_horizontal_8_neon|
 
+; void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
+;                                     int p,
+;                                     const uint8_t *blimit0,
+;                                     const uint8_t *limit0,
+;                                     const uint8_t *thresh0,
+;                                     const uint8_t *blimit1,
+;                                     const uint8_t *limit1,
+;                                     const uint8_t *thresh1)
+; r0      uint8_t *s,
+; r1      int p, /* pitch */
+; r2      const uint8_t *blimit0,
+; r3      const uint8_t *limit0,
+; sp      const uint8_t *thresh0,
+; sp + 4  const uint8_t *blimit1,
+; sp + 8  const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_horizontal_8_dual_neon| PROC
+    push        {r0-r1, lr}                ; save s, p and the return address (12 bytes)
+    ldr         lr, [sp, #12]              ; thresh0: entry [sp], now 12 bytes up
+    push        {lr}                       ; thresh0 on top of stack = stack argument for the call
+    bl          vpx_lpf_horizontal_8_neon  ; 1st pass: r0-r3 untouched (s, p, blimit0, limit0)
+
+    ldr         r2, [sp, #20]              ; blimit1 (entry sp + 4, plus the 16 bytes pushed)
+    ldr         r3, [sp, #24]              ; limit1  (entry sp + 8, plus 16)
+    ldr         lr, [sp, #28]              ; thresh1 (entry sp + 12, plus 16)
+    str         lr, [sp, #16]              ; overwrite entry [sp] slot (thresh0) with thresh1
+    add         sp, #4                     ; discard the pushed thresh0 copy
+    pop         {r0-r1, lr}                ; restore s, p, return address; sp back at entry value
+    add         r0, #8                     ; s + 8
+    b           vpx_lpf_horizontal_8_neon  ; 2nd pass as a tail branch: lr is our caller's address
+    ENDP        ; |vpx_lpf_horizontal_8_dual_neon|
+
 ; void vpx_lpf_vertical_8_neon(uint8_t *s,
 ;                              int pitch,
 ;                              const uint8_t *blimit,
     pop         {r4-r5, pc}
     ENDP        ; |vpx_lpf_vertical_8_neon|
 
+; void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
+;                                   int pitch,
+;                                   const uint8_t *blimit0,
+;                                   const uint8_t *limit0,
+;                                   const uint8_t *thresh0,
+;                                   const uint8_t *blimit1,
+;                                   const uint8_t *limit1,
+;                                   const uint8_t *thresh1)
+; r0      uint8_t *s,
+; r1      int pitch,
+; r2      const uint8_t *blimit0,
+; r3      const uint8_t *limit0,
+; sp      const uint8_t *thresh0,
+; sp + 4  const uint8_t *blimit1,
+; sp + 8  const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_vertical_8_dual_neon| PROC
+    push        {r0-r1, lr}                ; save s, pitch and the return address (12 bytes)
+    ldr         lr, [sp, #12]              ; thresh0: entry [sp], now 12 bytes up
+    push        {lr}                       ; thresh0 on top of stack = stack argument for the call
+    bl          vpx_lpf_vertical_8_neon    ; 1st pass: r0-r3 untouched (s, pitch, blimit0, limit0)
+
+    ldr         r2, [sp, #20]              ; blimit1 (entry sp + 4, plus the 16 bytes pushed)
+    ldr         r3, [sp, #24]              ; limit1  (entry sp + 8, plus 16)
+    ldr         lr, [sp, #28]              ; thresh1 (entry sp + 12, plus 16)
+    str         lr, [sp, #16]              ; overwrite entry [sp] slot (thresh0) with thresh1
+    add         sp, #4                     ; discard the pushed thresh0 copy
+    pop         {r0-r1, lr}                ; restore s, pitch, return address; sp back at entry value
+    add         r0, r1, lsl #3             ; s + 8 * pitch
+    b           vpx_lpf_vertical_8_neon    ; 2nd pass as a tail branch: lr is our caller's address
+    ENDP        ; |vpx_lpf_vertical_8_dual_neon|
+
 ; void vpx_mbloop_filter_neon();
 ; This is a helper function for the loopfilters. The individual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
index 854196f4272e692bc6f85333095c5139f7950032..8641541b0960a1666f5c31e30cecee2884713003 100644 (file)
@@ -311,6 +311,14 @@ void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
   return;
 }
 
+void vpx_lpf_horizontal_8_dual_neon(  // two vpx_lpf_horizontal_8_neon passes
+    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);  // at s, set 0
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);  // 8 bytes on, set 1
+}
+
 void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
   int i;
@@ -427,3 +435,11 @@ void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
   }
   return;
 }
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);  // at s, set 0
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);  // 8 * p bytes on, set 1
+}
index 7741b226c429d517df8999ed92ca39a4a45a38bc..ced5aef0ab20519b4c781f5a677389d5a27c4bf5 100644 (file)
@@ -21,21 +21,3 @@ void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
   vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
   vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
 }
-
-#if HAVE_NEON_ASM
-void vpx_lpf_horizontal_8_dual_neon(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-#endif  // HAVE_NEON_ASM
index 428d5e951e4e7b7ae38a9fcd531cc0da8b25c898..c7bad52226cf06d0ec64240a1ff2ef3bc811a81e 100644 (file)
@@ -514,8 +514,7 @@ add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *bl
 specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
@@ -533,8 +532,7 @@ add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *
 specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;