]> granicus.if.org Git - libvpx/commitdiff
vp9_reconintra_neon: add DC 8x8 predictors
authorJames Zern <jzern@google.com>
Thu, 28 May 2015 01:59:57 +0000 (18:59 -0700)
committerJames Zern <jzern@google.com>
Fri, 29 May 2015 22:39:08 +0000 (15:39 -0700)
~90% faster over 20M pixels

Change-Id: Iab791510cc57c8332c2f9a5da0ed50702e5f5763

test/test_intra_pred_speed.cc
vp9/common/arm/neon/vp9_reconintra_neon.c
vp9/common/vp9_rtcd_defs.pl
vp9/vp9_common.mk

index 2ec693b977362ebdb9d7591c963f7d4b06fe5643..d10c8ec891dff2dcb20c9d2a4b079aac6aa838de 100644 (file)
@@ -248,9 +248,11 @@ INTRA_PRED_TEST(DSPR2, TestIntraPred8, vp9_dc_predictor_8x8_dspr2, NULL, NULL,
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred8, NULL, NULL, NULL, NULL,
-                vp9_v_predictor_8x8_neon, vp9_h_predictor_8x8_neon, NULL, NULL,
-                NULL, NULL, NULL, NULL, vp9_tm_predictor_8x8_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred8, vp9_dc_predictor_8x8_neon,
+                vp9_dc_left_predictor_8x8_neon, vp9_dc_top_predictor_8x8_neon,
+                vp9_dc_128_predictor_8x8_neon, vp9_v_predictor_8x8_neon,
+                vp9_h_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_8x8_neon)
 
 #endif  // HAVE_NEON
 
index 66cf6600e78e3ca7d9ec4b2870d298728888ee57..82d11d60a678bf6a87aac447f6896d90251724d9 100644 (file)
@@ -8,9 +8,85 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stddef.h>
 #include <arm_neon.h>
 
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *above, const uint8_t *left,
+                          int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x8_t A = vld1_u8(above);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    sum_top = vcombine_u16(p2, p2);
+  }
+
+  if (do_left) {
+    const uint8x8_t L = vld1_u8(left);  // left border
+    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    sum_left = vcombine_u16(p2, p2);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 4);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 3);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 3);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 8; ++i) {
+      vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
+    }
+  }
+}
+
+void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  dc_8x8(dst, stride, above, left, 1, 1);
+}
+
+void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  dc_8x8(dst, stride, NULL, left, 0, 1);
+}
+
+void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  dc_8x8(dst, stride, above, NULL, 1, 0);
+}
+
+void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_8x8(dst, stride, NULL, NULL, 0, 0);
+}
+
+#if !HAVE_NEON_ASM
+
 void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
   int i;
@@ -423,3 +499,4 @@ void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
     }
   }
 }
+#endif  // !HAVE_NEON_ASM
index 30710ba0098da0691550cc122885d01c3384f4f6..d4c9070e3d38bb6ee90cb7aca340aaedbbadcd04 100644 (file)
@@ -123,16 +123,16 @@ add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, cons
 specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
+specialize qw/vp9_dc_predictor_8x8 dspr2 neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_8x8/, "$sse_x86inc";
+specialize qw/vp9_dc_top_predictor_8x8 neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_8x8/, "$sse_x86inc";
+specialize qw/vp9_dc_left_predictor_8x8 neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_8x8/, "$sse_x86inc";
+specialize qw/vp9_dc_128_predictor_8x8 neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
index cbc04888b48d92300fcfaadee52cdeb63482d1b9..7739a2b32d3a5be49a05fbbacb1f9fea418c9cf0 100644 (file)
@@ -199,8 +199,9 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c
 # TODO(johannkoenig): re-enable when chromium build is fixed
 # # https://code.google.com/p/chromium/issues/detail?id=443839
 #VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_reconintra_neon.c
+
 $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))