From: Johann
Date: Wed, 3 May 2017 19:06:29 +0000 (-0700)
Subject: subpel variance neon: add mixed sizes
X-Git-Tag: v1.7.0~482^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2346a6da4a3703eb2cb346f3b4a8e6d8a25c70f6;p=libvpx

subpel variance neon: add mixed sizes

Add support for every block size except the 4xM sizes.

Performance is better, and the numbers will improve again when the
variance optimizations land.

BUG=webm:1423

Change-Id: I92eb4312b20be423fa2fe6fdb18167a604ff4d80
---

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 6e31165fa..9eb9be3a1 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1227,9 +1227,16 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     NEON, VpxSubpelVarianceTest,
     ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_neon, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_neon, 0),
                       make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_neon, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_neon, 0),
                       make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0)));
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_neon, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_neon, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_neon, 0)));
 #endif  // HAVE_NEON

 #if HAVE_MSA
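Note: in VpxSubpelVarianceTest the first two tuple parameters are the base-2
logs of the block width and height, so make_tuple(6, 5, ...) exercises the
64x32 kernel and make_tuple(3, 2, ...) the 8x4 kernel. A minimal C sketch of
that encoding, assuming only the log2 convention described here
(check_tuple_dims is illustrative, not part of the test):

#include <assert.h>

/* The test fixture derives block dimensions from the log2 values in the
 * tuple: width = 1 << log2w, height = 1 << log2h. */
static void check_tuple_dims(int log2w, int log2h, int width, int height) {
  assert((1 << log2w) == width);
  assert((1 << log2h) == height);
}

int main(void) {
  check_tuple_dims(6, 5, 64, 32); /* vpx_sub_pixel_variance64x32_neon */
  check_tuple_dims(3, 2, 8, 4);   /* vpx_sub_pixel_variance8x4_neon */
  return 0;
}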
diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c
index 822989a4f..9b1622ff0 100644
--- a/vpx_dsp/arm/subpel_variance_neon.c
+++ b/vpx_dsp/arm/subpel_variance_neon.c
@@ -28,7 +28,6 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
-                                      unsigned int output_width,
                                       const uint8_t *filter) {
   const uint8x8_t f0 = vmov_n_u8(filter[0]);
   const uint8x8_t f1 = vmov_n_u8(filter[1]);
@@ -42,7 +41,7 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
     vst1_u8(&output_ptr[0], out);
     // Next row...
     src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
+    output_ptr += 8;
   }
 }
@@ -75,61 +74,36 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
   }
 }

-unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
-                                            int xoffset, int yoffset,
-                                            const uint8_t *dst, int dst_stride,
-                                            unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
-
-  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
-                            bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
-                            bilinear_filters[yoffset]);
-  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
-}
-
-unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
-                             bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
-                             bilinear_filters[yoffset]);
-  return vpx_variance16x16(temp2, 16, dst, dst_stride, sse);
-}
-
-unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
-                             bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
-                             bilinear_filters[yoffset]);
-  return vpx_variance32x32(temp2, 32, dst, dst_stride, sse);
-}
-
-unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
+// TODO(johannkoenig): support 4xM block sizes.
+#define sub_pixel_varianceNxM(n, m)                                      \
+  unsigned int vpx_sub_pixel_variance##n##x##m##_neon(                   \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
+      const uint8_t *dst, int dst_stride, unsigned int *sse) {           \
+    DECLARE_ALIGNED(16, uint8_t, fdata3[n * (m + 1)]);                   \
+    DECLARE_ALIGNED(16, uint8_t, temp2[n * m]);                          \
+                                                                         \
+    if (n == 8) {                                                        \
+      var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, (m + 1),     \
+                                bilinear_filters[xoffset]);              \
+      var_filter_block2d_bil_w8(fdata3, temp2, n, n, m,                  \
+                                bilinear_filters[yoffset]);              \
+    } else {                                                             \
+      var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, (m + 1), n, \
+                                 bilinear_filters[xoffset]);             \
+      var_filter_block2d_bil_w16(fdata3, temp2, n, n, m, n,              \
+                                 bilinear_filters[yoffset]);             \
+    }                                                                    \
+    return vpx_variance##n##x##m(temp2, n, dst, dst_stride, sse);        \
+  }

-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
-                             bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
-                             bilinear_filters[yoffset]);
-  return vpx_variance64x64(temp2, 64, dst, dst_stride, sse);
-}
+sub_pixel_varianceNxM(8, 4);
+sub_pixel_varianceNxM(8, 8);
+sub_pixel_varianceNxM(8, 16);
+sub_pixel_varianceNxM(16, 8);
+sub_pixel_varianceNxM(16, 16);
+sub_pixel_varianceNxM(16, 32);
+sub_pixel_varianceNxM(32, 16);
+sub_pixel_varianceNxM(32, 32);
+sub_pixel_varianceNxM(32, 64);
+sub_pixel_varianceNxM(64, 32);
+sub_pixel_varianceNxM(64, 64);
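For reference, each sub_pixel_varianceNxM(n, m) instantiation expands to the
same shape as the per-size functions it replaces: a horizontal bilinear pass
over m + 1 rows into fdata3 (the extra row feeds the second vertical tap), a
vertical pass into temp2 with pixel_step equal to the row width, then the
full-pel variance kernel. After constant folding of the if (n == 8) test,
sub_pixel_varianceNxM(16, 8) is effectively the following (a hand-expanded
sketch, formatting approximate):

unsigned int vpx_sub_pixel_variance16x8_neon(const uint8_t *src, int src_stride,
                                             int xoffset, int yoffset,
                                             const uint8_t *dst, int dst_stride,
                                             unsigned int *sse) {
  /* Intermediate buffers: one extra row for the vertical filter tap. */
  DECLARE_ALIGNED(16, uint8_t, fdata3[16 * (8 + 1)]);
  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 8]);

  /* First pass: horizontal filter, pixel_step 1, 9 rows of 16 pixels. */
  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 8 + 1, 16,
                             bilinear_filters[xoffset]);
  /* Second pass: vertical filter over the intermediate block. */
  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 8, 16,
                             bilinear_filters[yoffset]);
  /* Variance of the filtered block against the reference. */
  return vpx_variance16x8(temp2, 16, dst, dst_stride, sse);
}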
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index b2058929c..d025a2f5b 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1240,34 +1240,34 @@ add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int
   specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x32 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance64x32 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x64 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance32x64 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x16 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance32x16 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x32 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance16x32 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x8 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance16x8 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x16 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance8x16 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x4 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance8x4 neon msa sse2 ssse3/;

 add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance4x8 msa sse2 ssse3/;
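The specialize lines feed the rtcd (run-time CPU detection) generator, which
emits one function pointer per prototype and points it at the best
implementation the target supports; adding neon here is what routes callers
to the new kernels. A simplified sketch of that dispatch, assuming NEON is
detected at startup (the shape follows the generated vpx_dsp_rtcd.h pattern,
but this is illustrative, not the generated code):

#include <stdint.h>

typedef uint32_t (*vpx_sub_pixel_variance_fn_t)(
    const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
    const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

/* Implementations provided elsewhere in the library. */
uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *, int, int, int,
                                       const uint8_t *, int, uint32_t *);
uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *, int, int, int,
                                          const uint8_t *, int, uint32_t *);

/* Defaults to the C version; re-pointed during rtcd setup. */
vpx_sub_pixel_variance_fn_t vpx_sub_pixel_variance64x32 =
    vpx_sub_pixel_variance64x32_c;

static void setup_rtcd_internal(int have_neon) {
  /* With this patch, NEON targets pick up the mixed-size kernels too. */
  if (have_neon) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_neon;
}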