From: James Zern
Date: Thu, 18 Jun 2015 03:52:13 +0000 (-0700)
Subject: vp9_reconintra_neon: add d45 4x4
X-Git-Tag: v1.5.0~548^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ce88d74d349da857828f0b7e9c1ce44cf7f56530;p=libvpx

vp9_reconintra_neon: add d45 4x4

based on webp's LD4()
~59% faster over 20M pixels

Change-Id: I371eaed9ce8f470451046997e130b0ba1a2f7a9c
---

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 096526af4..352cde25a 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -211,8 +211,9 @@ INTRA_PRED_TEST(DSPR2, TestIntraPred4, vp9_dc_predictor_4x4_dspr2, NULL, NULL,
 INTRA_PRED_TEST(NEON, TestIntraPred4, vp9_dc_predictor_4x4_neon,
                 vp9_dc_left_predictor_4x4_neon, vp9_dc_top_predictor_4x4_neon,
                 vp9_dc_128_predictor_4x4_neon, vp9_v_predictor_4x4_neon,
-                vp9_h_predictor_4x4_neon, NULL, vp9_d135_predictor_4x4_neon,
-                NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon)
+                vp9_h_predictor_4x4_neon, vp9_d45_predictor_4x4_neon,
+                vp9_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_4x4_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index 48bce78b3..13c46a57e 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -315,6 +315,31 @@ void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 
 // -----------------------------------------------------------------------------
 
+void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);
+  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  (void)left;
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+  dst[3 * stride + 3] = above[7];
+}
+
+// -----------------------------------------------------------------------------
+
 void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index d191062db..d5084bc25 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -60,7 +60,7 @@ add_proto qw/void vp9_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vp9_d45_predictor_4x4 neon/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";