vp9_reconintra_neon: add DC 4x4 predictors

author James Zern <jzern@google.com>

Wed, 17 Jun 2015 23:34:14 +0000 (16:34 -0700)

committer James Zern <jzern@google.com>

Thu, 18 Jun 2015 22:22:43 +0000 (15:22 -0700)
author James Zern <jzern@google.com>
Wed, 17 Jun 2015 23:34:14 +0000 (16:34 -0700)
committer James Zern <jzern@google.com>
Thu, 18 Jun 2015 22:22:43 +0000 (15:22 -0700)
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc

index e387857ae4541c8ac34dfd3d919929adb7b4ad8f..9c3e527c2e83ce58b97202b9cfbdbb95ded6ca93 100644 (file)
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -208,9 +208,11 @@ INTRA_PRED_TEST(DSPR2, TestIntraPred4, vp9_dc_predictor_4x4_dspr2, NULL, NULL,
  #endif  // HAVE_DSPR2
  
  #if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred4, NULL, NULL, NULL, NULL,
-                vp9_v_predictor_4x4_neon, vp9_h_predictor_4x4_neon, NULL, NULL,
-                NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred4, vp9_dc_predictor_4x4_neon,
+                vp9_dc_left_predictor_4x4_neon, vp9_dc_top_predictor_4x4_neon,
+                vp9_dc_128_predictor_4x4_neon, vp9_v_predictor_4x4_neon,
+                vp9_h_predictor_4x4_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_4x4_neon)
  #endif  // HAVE_NEON
  
  #if HAVE_MSA
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c

index a053e4f10a69271ffdd94903a9f8c1f0b7203d00..65a29366fc746f56aa8f27731fd9f0184dd9dae8 100644 (file)
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -14,6 +14,75 @@
  #include "./vpx_config.h"
  #include "vpx/vpx_integer.h"
  
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// Writes one DC byte to all 16 pixels of a 4x4 block; 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *above, const uint8_t *left,
+                          int do_above, int do_left) {
+  uint16x8_t sum_top;  // assigned only when do_above is non-zero
+  uint16x8_t sum_left;  // assigned only when do_left is non-zero
+  uint8x8_t dc0;  // DC value; only lane 0 is consumed by the store loop
+  // Reduce the row above so that lane 0 of p1 holds above[0] + ... + above[3].
+  if (do_above) {
+    const uint8x8_t A = vld1_u8(above);  // NOTE(review): 8-byte load, only above[0..3] reach lane 0; confirm 8 bytes are readable
+    const uint16x4_t p0 = vpaddl_u8(A);  // widening pairwise add: 8 x u8 -> 4 x u16
+    const uint16x4_t p1 = vpadd_u16(p0, p0);  // lane 0 = p0[0] + p0[1]
+    sum_top = vcombine_u16(p1, p1);  // 128-bit form needed by vaddq_u16 / vrshrn_n_u16
+  }
+  // Same reduction applied to the left border.
+  if (do_left) {
+    const uint8x8_t L = vld1_u8(left);  // NOTE(review): 8-byte load, only left[0..3] reach lane 0; confirm 8 bytes are readable
+    const uint16x4_t p0 = vpaddl_u8(L);  // widening pairwise add: 8 x u8 -> 4 x u16
+    const uint16x4_t p1 = vpadd_u16(p0, p0);  // lane 0 = p0[0] + p0[1]
+    sum_left = vcombine_u16(p1, p1);  // 128-bit form needed by vaddq_u16 / vrshrn_n_u16
+  }
+  // Turn the sum(s) into the DC byte; vrshrn_n_u16 is a rounding right shift that narrows to u8.
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3: rounded mean of the 8 border values
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 2);  // (sum + 2) >> 2: rounded mean of the 4 above values
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 2);  // (sum + 2) >> 2: rounded mean of the 4 left values
+  } else {
+    dc0 = vdup_n_u8(0x80);  // no border used: constant 128 in every lane
+  }
+  // Broadcast lane 0 of dc0, then store its low 4 bytes to each of the 4 rows.
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 4; ++i) {
+      vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+    }
+  }
+}
+
+void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  dc_4x4(dst, stride, above, left, 1, 1);  // do_above = 1, do_left = 1: DC from both borders
+}
+
+void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: NULL is passed in its place below
+  dc_4x4(dst, stride, NULL, left, 0, 1);  // do_above = 0, do_left = 1: DC from the left border only
+}
+
+void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: NULL is passed in its place below
+  dc_4x4(dst, stride, above, NULL, 1, 0);  // do_above = 1, do_left = 0: DC from the above row only
+}
+
+void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: neither border is read
+  (void)left;  // unused: neither border is read
+  dc_4x4(dst, stride, NULL, NULL, 0, 0);  // do_above = 0, do_left = 0: every pixel is set to 0x80
+}
+
  //------------------------------------------------------------------------------
  // DC 8x8
  
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl

index bc1a3c913f7763281acb9d8c5e4d587eacce82a7..da7d5fce3952ef2caaa50a5174848ff8a05266ad 100644 (file)
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -84,16 +84,16 @@ add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, cons
  specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
  
  add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_4x4 dspr2 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
  
  add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
  
  add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
  
  add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
  
  add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
  specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
author	James Zern <jzern@google.com>
	Wed, 17 Jun 2015 23:34:14 +0000 (16:34 -0700)
committer	James Zern <jzern@google.com>
	Thu, 18 Jun 2015 22:22:43 +0000 (15:22 -0700)
test/test_intra_pred_speed.cc		patch \| blob \| history
vp9/common/arm/neon/vp9_reconintra_neon.c		patch \| blob \| history
vp9/common/vp9_rtcd_defs.pl		patch \| blob \| history