]> granicus.if.org Git - libvpx/commitdiff
vp9_reconintra_neon: add d45 4x4
authorJames Zern <jzern@google.com>
Thu, 18 Jun 2015 03:52:13 +0000 (20:52 -0700)
committerJames Zern <jzern@google.com>
Thu, 18 Jun 2015 22:25:07 +0000 (15:25 -0700)
based on webp's LD4()

~59% faster over 20M pixels

Change-Id: I371eaed9ce8f470451046997e130b0ba1a2f7a9c

test/test_intra_pred_speed.cc
vp9/common/arm/neon/vp9_reconintra_neon.c
vp9/common/vp9_rtcd_defs.pl

index 096526af4fe258523abaff44e472c73049e58c36..352cde25a9ca19069caab37e1b35b36df82ed328 100644 (file)
@@ -211,8 +211,9 @@ INTRA_PRED_TEST(DSPR2, TestIntraPred4, vp9_dc_predictor_4x4_dspr2, NULL, NULL,
 INTRA_PRED_TEST(NEON, TestIntraPred4, vp9_dc_predictor_4x4_neon,
                 vp9_dc_left_predictor_4x4_neon, vp9_dc_top_predictor_4x4_neon,
                 vp9_dc_128_predictor_4x4_neon, vp9_v_predictor_4x4_neon,
-                vp9_h_predictor_4x4_neon, NULL, vp9_d135_predictor_4x4_neon,
-                NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon)
+                vp9_h_predictor_4x4_neon, vp9_d45_predictor_4x4_neon,
+                vp9_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_4x4_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
index 48bce78b3aff509f3bbd7265e6093db15c86b334..13c46a57ee13bbd986a1cb43e0026fef4a2b8be0 100644 (file)
@@ -315,6 +315,31 @@ void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 
 // -----------------------------------------------------------------------------
 
+void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);
+  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  (void)left;
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+  dst[3 * stride + 3] = above[7];
+}
+
+// -----------------------------------------------------------------------------
+
 void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
index d191062db7c97106ba0500c249f3236d174dea8a..d5084bc251f47e33b517d43627c3573f772aba74 100644 (file)
@@ -60,7 +60,7 @@ add_proto qw/void vp9_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vp9_d45_predictor_4x4 neon/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";