From 74d40cd507594fe775644d52a8f03b23f3ddf8c9 Mon Sep 17 00:00:00 2001 From: Frank Galligan Date: Tue, 13 Jan 2015 11:15:24 -0800 Subject: [PATCH] Add 64x variance Neon functions Add optimized Neon functions of: vp9_variance32x64 vp9_variance64x32 vp9_variance64x64 On Nexus 7 speed -5 and -6 saw about a 4% increase in perf. Speeds -7 and -8 saw about a 6% increase in perf. Tested on Nexus 7, built with ndk r10d, gcc 4.9. Change-Id: I5a81f13c9897eb927fa39662530f5524a0f768fa --- test/variance_test.cc | 8 ++++++- vp9/common/vp9_rtcd_defs.pl | 6 +++--- vp9/encoder/arm/neon/vp9_variance_neon.c | 27 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/test/variance_test.cc b/test/variance_test.cc index 4d279f686..e7517a7d8 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1914,11 +1914,17 @@ INSTANTIATE_TEST_CASE_P( const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon; const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon; +const vp9_variance_fn_t variance32x64_neon = vp9_variance32x64_neon; +const vp9_variance_fn_t variance64x32_neon = vp9_variance64x32_neon; +const vp9_variance_fn_t variance64x64_neon = vp9_variance64x64_neon; INSTANTIATE_TEST_CASE_P( NEON, VP9VarianceTest, ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0), make_tuple(4, 4, variance16x16_neon, 0), - make_tuple(5, 5, variance32x32_neon, 0))); + make_tuple(5, 5, variance32x32_neon, 0), + make_tuple(5, 6, variance32x64_neon, 0), + make_tuple(6, 5, variance64x32_neon, 0), + make_tuple(6, 6, variance64x64_neon, 0))); const vp9_subpixvariance_fn_t subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon; diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 88f85a86d..b59e6ebe7 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -798,16 +798,16 @@ add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t 
*src_ptr, int sourc specialize qw/vp9_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x64/, "$sse2_x86inc"; +specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc"; diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c index 816fbda1f..567b7deb1 100644 --- a/vp9/encoder/arm/neon/vp9_variance_neon.c +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -10,6 +10,7 @@ #include <arm_neon.h> #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" @@ -28,6 +29,8 @@ enum { kHeight16PlusOne = 17 }; enum { kWidth32 = 32 }; enum { kHeight32 = 32 }; enum { kHeight32PlusOne = 33 }; +enum { kWidth64 = 64 }; +enum { kHeight64 = 64 }; enum { kPixelStepOne = 1 }; enum { kAlign16 = 16 }; @@ -208,6 +211,30 @@ unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) 
/ (kWidth32 * kHeight32)); } +unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight64, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 11); // >> 11 = / (32 * 64) +} + +unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight32, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 11); // >> 11 = / (64 * 32) +} + +unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight64, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 12); // >> 12 = / (64 * 64) +} + unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, int src_stride, int xoffset, -- 2.40.0