NEON asm of vpx_lpf_{horizontal,vertical}_8_dual_neon()

author Linfeng Zhang <linfengz@google.com>

Sat, 13 Aug 2016 01:14:21 +0000 (18:14 -0700)

committer Linfeng Zhang <linfengz@google.com>

Tue, 16 Aug 2016 15:50:57 +0000 (08:50 -0700)
author Linfeng Zhang <linfengz@google.com>
Sat, 13 Aug 2016 01:14:21 +0000 (18:14 -0700)
committer Linfeng Zhang <linfengz@google.com>
Tue, 16 Aug 2016 15:50:57 +0000 (08:50 -0700)
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc

index 27bedb8c714f22e1dd51fd522697a9ce1cc7bf8f..552b5e33c297daabdb225307eeb4e2853d2caf71 100644 (file)
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -531,20 +531,16 @@ INSTANTIATE_TEST_CASE_P(
          make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8),
          make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8),
          make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8)));
-INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param,
-                        ::testing::Values(
-// Using #if inside the macro is unsupported on MSVS but the tests are not
-// currently built for MSVS with ARM and NEON.
-#if HAVE_NEON_ASM
-                            make_tuple(&vpx_lpf_horizontal_8_dual_neon,
-                                       &vpx_lpf_horizontal_8_dual_c, 8),
-                            make_tuple(&vpx_lpf_vertical_8_dual_neon,
-                                       &vpx_lpf_vertical_8_dual_c, 8),
-#endif  // HAVE_NEON_ASM
-                            make_tuple(&vpx_lpf_horizontal_4_dual_neon,
-                                       &vpx_lpf_horizontal_4_dual_c, 8),
-                            make_tuple(&vpx_lpf_vertical_4_dual_neon,
-                                       &vpx_lpf_vertical_4_dual_c, 8)));
+INSTANTIATE_TEST_CASE_P(
+    NEON, Loop8Test9Param,
+    ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon,
+                                 &vpx_lpf_horizontal_8_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_8_dual_neon,
+                                 &vpx_lpf_vertical_8_dual_c, 8),
+                      make_tuple(&vpx_lpf_horizontal_4_dual_neon,
+                                 &vpx_lpf_horizontal_4_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_4_dual_neon,
+                                 &vpx_lpf_vertical_4_dual_c, 8)));
  #endif  // CONFIG_VP9_HIGHBITDEPTH
  #endif  // HAVE_NEON
  
diff --git a/vpx_dsp/arm/loopfilter_8_neon.asm b/vpx_dsp/arm/loopfilter_8_neon.asm

index a2f20e15f81f711c4e877303e136f98c0b41c157..a042d40acb2cf3bab77ed6a34d0fab01979541fc 100644 (file)
--- a/vpx_dsp/arm/loopfilter_8_neon.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -9,7 +9,9 @@
  ;
  
      EXPORT  |vpx_lpf_horizontal_8_neon|
+    EXPORT  |vpx_lpf_horizontal_8_dual_neon|
      EXPORT  |vpx_lpf_vertical_8_neon|
+    EXPORT  |vpx_lpf_vertical_8_dual_neon|
      ARM
  
      AREA ||.text||, CODE, READONLY, ALIGN=2
@@ -64,6 +66,38 @@
  
      ENDP        ; |vpx_lpf_horizontal_8_neon|
  
+;void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
+;                                    int p,
+;                                    const uint8_t *blimit0,
+;                                    const uint8_t *limit0,
+;                                    const uint8_t *thresh0,
+;                                    const uint8_t *blimit1,
+;                                    const uint8_t *limit1,
+;                                    const uint8_t *thresh1)
+; r0      uint8_t *s,
+; r1      int p, /* pitch */
+; r2      const uint8_t *blimit0,
+; r3      const uint8_t *limit0,
+; sp      const uint8_t *thresh0,
+; sp + 4  const uint8_t *blimit1,
+; sp + 8  const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_horizontal_8_dual_neon| PROC
+    push        {r0-r1, lr}
+    ldr         lr, [sp, #12]
+    push        {lr}                       ; thresh0
+    bl          vpx_lpf_horizontal_8_neon
+
+    ldr         r2, [sp, #20]              ; blimit1
+    ldr         r3, [sp, #24]              ; limit1
+    ldr         lr, [sp, #28]
+    str         lr, [sp, #16]              ; thresh1
+    add         sp, #4
+    pop         {r0-r1, lr}
+    add         r0, #8                     ; s + 8
+    b           vpx_lpf_horizontal_8_neon
+    ENDP        ; |vpx_lpf_horizontal_8_dual_neon|
+
  ; void vpx_lpf_vertical_8_neon(uint8_t *s,
  ;                              int pitch,
  ;                              const uint8_t *blimit,
@@ -139,6 +173,38 @@
      pop         {r4-r5, pc}
      ENDP        ; |vpx_lpf_vertical_8_neon|
  
+;void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
+;                                  int pitch,
+;                                  const uint8_t *blimit0,
+;                                  const uint8_t *limit0,
+;                                  const uint8_t *thresh0,
+;                                  const uint8_t *blimit1,
+;                                  const uint8_t *limit1,
+;                                  const uint8_t *thresh1)
+; r0      uint8_t *s,
+; r1      int pitch
+; r2      const uint8_t *blimit0,
+; r3      const uint8_t *limit0,
+; sp      const uint8_t *thresh0,
+; sp + 4  const uint8_t *blimit1,
+; sp + 8  const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_vertical_8_dual_neon| PROC
+    push        {r0-r1, lr}
+    ldr         lr, [sp, #12]
+    push        {lr}                       ; thresh0
+    bl          vpx_lpf_vertical_8_neon
+
+    ldr         r2, [sp, #20]              ; blimit1
+    ldr         r3, [sp, #24]              ; limit1
+    ldr         lr, [sp, #28]
+    str         lr, [sp, #16]              ; thresh1
+    add         sp, #4
+    pop         {r0-r1, lr}
+    add         r0, r1, lsl #3             ; s + 8 * pitch
+    b           vpx_lpf_vertical_8_neon
+    ENDP        ; |vpx_lpf_vertical_8_dual_neon|
+
  ; void vpx_mbloop_filter_neon();
  ; This is a helper function for the loopfilters. The invidual functions do the
  ; necessary load, transpose (if necessary) and store. The function does not use
diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c

index 854196f4272e692bc6f85333095c5139f7950032..8641541b0960a1666f5c31e30cecee2884713003 100644 (file)
--- a/vpx_dsp/arm/loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c
@@ -311,6 +311,14 @@ void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
    return;
  }
  
+void vpx_lpf_horizontal_8_dual_neon(
+    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
+}
+
  void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
    int i;
@@ -427,3 +435,11 @@ void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
    }
    return;
  }
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c

index 7741b226c429d517df8999ed92ca39a4a45a38bc..ced5aef0ab20519b4c781f5a677389d5a27c4bf5 100644 (file)
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -21,21 +21,3 @@ void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
    vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
    vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
  }
-
-#if HAVE_NEON_ASM
-void vpx_lpf_horizontal_8_dual_neon(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-#endif  // HAVE_NEON_ASM
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index 428d5e951e4e7b7ae38a9fcd531cc0da8b25c898..c7bad52226cf06d0ec64240a1ff2ef3bc811a81e 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -514,8 +514,7 @@ add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *bl
  specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
  
  add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/;
  
  add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
  specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
@@ -533,8 +532,7 @@ add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *
  specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
  
  add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/;
  
  add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
  specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;
author	Linfeng Zhang <linfengz@google.com>
	Sat, 13 Aug 2016 01:14:21 +0000 (18:14 -0700)
committer	Linfeng Zhang <linfengz@google.com>
	Tue, 16 Aug 2016 15:50:57 +0000 (08:50 -0700)
test/lpf_8_test.cc		patch \| blob \| history
vpx_dsp/arm/loopfilter_8_neon.asm		patch \| blob \| history
vpx_dsp/arm/loopfilter_8_neon.c		patch \| blob \| history
vpx_dsp/arm/loopfilter_neon.c		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history