From: James Zern Date: Thu, 28 May 2015 01:59:57 +0000 (-0700) Subject: vp9_reconintra_neon: add DC 8x8 predictors X-Git-Tag: v1.5.0~635^2~1 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e97b84921941dd40dd6464a59558c75815ac8e9f;p=libvpx vp9_reconintra_neon: add DC 8x8 predictors ~90% faster over 20M pixels Change-Id: Iab791510cc57c8332c2f9a5da0ed50702e5f5763 --- diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 2ec693b97..d10c8ec89 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -248,9 +248,11 @@ INTRA_PRED_TEST(DSPR2, TestIntraPred8, vp9_dc_predictor_8x8_dspr2, NULL, NULL, #endif // HAVE_DSPR2 #if HAVE_NEON -INTRA_PRED_TEST(NEON, TestIntraPred8, NULL, NULL, NULL, NULL, - vp9_v_predictor_8x8_neon, vp9_h_predictor_8x8_neon, NULL, NULL, - NULL, NULL, NULL, NULL, vp9_tm_predictor_8x8_neon) +INTRA_PRED_TEST(NEON, TestIntraPred8, vp9_dc_predictor_8x8_neon, + vp9_dc_left_predictor_8x8_neon, vp9_dc_top_predictor_8x8_neon, + vp9_dc_128_predictor_8x8_neon, vp9_v_predictor_8x8_neon, + vp9_h_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL, NULL, + vp9_tm_predictor_8x8_neon) #endif // HAVE_NEON diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c index 66cf6600e..82d11d60a 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.c +++ b/vp9/common/arm/neon/vp9_reconintra_neon.c @@ -8,9 +8,85 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <stddef.h> #include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +//------------------------------------------------------------------------------ +// DC 8x8 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_top = vcombine_u16(p2, p2); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_left = vcombine_u16(p2, p2); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 4); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 3); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 3); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 8; ++i) { + vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); + } + } +} + +void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_8x8(dst, stride, above, left, 1, 1); +} + +void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_8x8(dst, stride, NULL, left, 0, 1); +} + +void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_8x8(dst, stride, above, NULL, 1, 0); +} + +void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_8x8(dst, stride, NULL, NULL, 0, 0); +} + +#if !HAVE_NEON_ASM + void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t 
*left) { int i; @@ -423,3 +499,4 @@ void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, } } } +#endif // !HAVE_NEON_ASM diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 30710ba00..d4c9070e3 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -123,16 +123,16 @@ add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, cons specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc"; add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc"; +specialize qw/vp9_dc_predictor_8x8 dspr2 neon/, "$sse_x86inc"; add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_8x8/, "$sse_x86inc"; +specialize qw/vp9_dc_top_predictor_8x8 neon/, "$sse_x86inc"; add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_8x8/, "$sse_x86inc"; +specialize qw/vp9_dc_left_predictor_8x8 neon/, "$sse_x86inc"; add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_8x8/, "$sse_x86inc"; +specialize qw/vp9_dc_128_predictor_8x8 neon/, "$sse_x86inc"; add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc"; diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index cbc04888b..7739a2b32 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -199,8 +199,9 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c # TODO(johannkoenig): re-enable when chromium build is fixed # # https://code.google.com/p/chromium/issues/detail?id=443839 #VP9_COMMON_SRCS-yes += 
common/arm/neon/vp9_loopfilter_8_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_reconintra_neon.c + $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))