From 938b8dfc73d1cd1c3ce1c4a66e64147a55e163f3 Mon Sep 17 00:00:00 2001
From: Geza Lore
Date: Fri, 4 Mar 2016 15:55:48 +0000
Subject: [PATCH] Extend convolution functions to 128x128 for ext-partition.

Change-Id: I7f7e26cd1d58eb38417200550c6fbf4108c9f942
---
 test/convolve_test.cc                      | 128 ++++++++++++++++-----
 test/masked_sad_test.cc                    |   2 -
 test/masked_variance_test.cc               |   2 -
 vpx_dsp/vpx_convolve.c                     |  53 +++++----
 vpx_dsp/vpx_convolve.h                     |  18 +++
 vpx_dsp/vpx_dsp_common.h                   |   6 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl               |  78 ++++++------
 vpx_dsp/x86/convolve.h                     |  58 ++++++----
 vpx_dsp/x86/vpx_convolve_copy_sse2.asm     | 118 ++++++++++++++++++-
 vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c |  37 ++++--
 10 files changed, 366 insertions(+), 134 deletions(-)

diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 12022be52..0e54c4013 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -28,7 +28,7 @@
 
 namespace {
 
-static const unsigned int kMaxDimension = 64;
+static const unsigned int kMaxDimension = MAX_CU_SIZE;
 
 typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
@@ -102,7 +102,7 @@ void filter_block2d_8_c(const uint8_t *src_ptr,
   //                            = 23
   // and filter_max_width = 16
   //
-  uint8_t intermediate_buffer[71 * kMaxDimension];
+  uint8_t intermediate_buffer[(kMaxDimension+8) * kMaxDimension];
   const int intermediate_next_stride = 1 - intermediate_height * output_width;
 
   // Horizontal pass (src -> transposed intermediate).
@@ -183,9 +183,9 @@ void filter_average_block2d_8_c(const uint8_t *src_ptr,
 
   assert(output_width <= kMaxDimension);
   assert(output_height <= kMaxDimension);
-  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, kMaxDimension,
                      output_width, output_height);
-  block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+  block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
                     output_width, output_height);
 }
 
@@ -214,7 +214,7 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
    *                            = 23
    * and filter_max_width = 16
    */
-  uint16_t intermediate_buffer[71 * kMaxDimension];
+  uint16_t intermediate_buffer[(kMaxDimension+8) * kMaxDimension];
   const int intermediate_next_stride = 1 - intermediate_height * output_width;
 
   // Horizontal pass (src -> transposed intermediate).
@@ -302,9 +302,10 @@ void highbd_filter_average_block2d_8_c(const uint16_t *src_ptr,
 
   assert(output_width <= kMaxDimension);
   assert(output_height <= kMaxDimension);
-  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                            tmp, kMaxDimension,
                             output_width, output_height, bd);
-  highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+  highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
                            output_width, output_height);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -351,7 +352,7 @@ class ConvolveTest : public ::testing::TestWithParam {
 
  protected:
   static const int kDataAlignment = 16;
-  static const int kOuterBlockSize = 256;
+  static const int kOuterBlockSize = 4*kMaxDimension;
   static const int kInputStride = kOuterBlockSize;
   static const int kOutputStride = kOuterBlockSize;
   static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
@@ -414,7 +415,8 @@ class ConvolveTest : public ::testing::TestWithParam {
   void CopyOutputToRef() {
     memcpy(output_ref_, output_, kOutputBufferSize);
 #if CONFIG_VP9_HIGHBITDEPTH
-    memcpy(output16_ref_, output16_, kOutputBufferSize);
+    memcpy(output16_ref_, output16_,
+           kOutputBufferSize * sizeof(*output16_ref_));
 #endif
   }
 
@@ -426,41 +428,41 @@ class ConvolveTest : public ::testing::TestWithParam {
   }
 
   uint8_t *input() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return input_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(input16_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(input16_) + index;
     }
 #else
-    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return input_ + index;
 #endif
   }
 
   uint8_t *output() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return output_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(output16_ + index);
     }
 #else
-    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return output_ + index;
 #endif
   }
 
   uint8_t *output_ref() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return output_ref_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ref_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(output16_ref_ + index);
     }
 #else
-    return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return output_ref_ + index;
 #endif
   }
@@ -1035,6 +1037,11 @@ const ConvolveFunctions convolve8_c(
     wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
     wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
 INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_c),
+    make_tuple(64, 128, &convolve8_c),
+    make_tuple(128, 128, &convolve8_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_c),
     make_tuple(8, 4, &convolve8_c),
     make_tuple(4, 8, &convolve8_c),
@@ -1057,6 +1064,11 @@ const ConvolveFunctions convolve10_c(
     wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
     wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10);
 INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve10_c),
+    make_tuple(64, 128, &convolve10_c),
+    make_tuple(128, 128, &convolve10_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve10_c),
     make_tuple(8, 4, &convolve10_c),
     make_tuple(4, 8, &convolve10_c),
@@ -1079,6 +1091,11 @@ const ConvolveFunctions convolve12_c(
     wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
     wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12);
 INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve12_c),
+    make_tuple(64, 128, &convolve12_c),
+    make_tuple(128, 128, &convolve12_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve12_c),
     make_tuple(8, 4, &convolve12_c),
     make_tuple(4, 8, &convolve12_c),
@@ -1105,6 +1122,11 @@ const ConvolveFunctions convolve8_c(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_c),
+    make_tuple(64, 128, &convolve8_c),
+    make_tuple(128, 128, &convolve8_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_c),
     make_tuple(8, 4, &convolve8_c),
     make_tuple(4, 8, &convolve8_c),
@@ -1158,7 +1180,12 @@ const ConvolveFunctions convolve12_sse2(
     wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
     wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
     wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+INSTANTIATE_TEST_CASE_P(SSE2_8, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_sse2),
+    make_tuple(64, 128, &convolve8_sse2),
+    make_tuple(128, 128, &convolve8_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_sse2),
     make_tuple(8, 4, &convolve8_sse2),
     make_tuple(4, 8, &convolve8_sse2),
@@ -1171,7 +1198,13 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
     make_tuple(32, 32, &convolve8_sse2),
     make_tuple(64, 32, &convolve8_sse2),
     make_tuple(32, 64, &convolve8_sse2),
-    make_tuple(64, 64, &convolve8_sse2),
+    make_tuple(64, 64, &convolve8_sse2)));
+INSTANTIATE_TEST_CASE_P(SSE2_10, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve10_sse2),
+    make_tuple(64, 128, &convolve10_sse2),
+    make_tuple(128, 128, &convolve10_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve10_sse2),
     make_tuple(8, 4, &convolve10_sse2),
     make_tuple(4, 8, &convolve10_sse2),
@@ -1184,7 +1217,13 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
     make_tuple(32, 32, &convolve10_sse2),
     make_tuple(64, 32, &convolve10_sse2),
     make_tuple(32, 64, &convolve10_sse2),
-    make_tuple(64, 64, &convolve10_sse2),
+    make_tuple(64, 64, &convolve10_sse2)));
+INSTANTIATE_TEST_CASE_P(SSE2_12, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve12_sse2),
+    make_tuple(64, 128, &convolve12_sse2),
+    make_tuple(128, 128, &convolve12_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve12_sse2),
     make_tuple(8, 4, &convolve12_sse2),
     make_tuple(4, 8, &convolve12_sse2),
@@ -1213,6 +1252,11 @@ const ConvolveFunctions convolve8_sse2(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_sse2),
+    make_tuple(64, 128, &convolve8_sse2),
+    make_tuple(128, 128, &convolve8_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_sse2),
     make_tuple(8, 4, &convolve8_sse2),
     make_tuple(4, 8, &convolve8_sse2),
@@ -1237,9 +1281,14 @@ const ConvolveFunctions convolve8_ssse3(
     vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
-    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+    vpx_scaled_2d_ssse3, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_ssse3),
+    make_tuple(64, 128, &convolve8_ssse3),
+    make_tuple(128, 128, &convolve8_ssse3),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_ssse3),
     make_tuple(8, 4, &convolve8_ssse3),
     make_tuple(4, 8, &convolve8_ssse3),
@@ -1266,6 +1315,11 @@ const ConvolveFunctions convolve8_avx2(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_avx2),
+    make_tuple(64, 128, &convolve8_avx2),
+    make_tuple(128, 128, &convolve8_avx2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_avx2),
     make_tuple(8, 4, &convolve8_avx2),
     make_tuple(4, 8, &convolve8_avx2),
@@ -1281,7 +1335,8 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
     make_tuple(64, 64, &convolve8_avx2)));
 #endif  // HAVE_AVX2 && HAVE_SSSE3
 
-#if HAVE_NEON
+// TODO(any): Make NEON versions support 128x128 128x64 64x128 block sizes
+#if HAVE_NEON && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 #if HAVE_NEON_ASM
 const ConvolveFunctions convolve8_neon(
     vpx_convolve_copy_neon, vpx_convolve_avg_neon,
@@ -1303,6 +1358,11 @@ const ConvolveFunctions convolve8_neon(
 #endif  // HAVE_NEON_ASM
 
 INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_neon),
+    make_tuple(64, 128, &convolve8_neon),
+    make_tuple(128, 128, &convolve8_neon),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_neon),
     make_tuple(8, 4, &convolve8_neon),
     make_tuple(4, 8, &convolve8_neon),
@@ -1318,7 +1378,8 @@ INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
     make_tuple(64, 64, &convolve8_neon)));
 #endif  // HAVE_NEON
 
-#if HAVE_DSPR2
+// TODO(any): Make DSPR2 versions support 128x128 128x64 64x128 block sizes
+#if HAVE_DSPR2 && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 const ConvolveFunctions convolve8_dspr2(
     vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2,
     vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2,
@@ -1329,6 +1390,11 @@ const ConvolveFunctions convolve8_dspr2(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_dspr2),
+    make_tuple(64, 128, &convolve8_dspr2),
+    make_tuple(128, 128, &convolve8_dspr2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_dspr2),
     make_tuple(8, 4, &convolve8_dspr2),
     make_tuple(4, 8, &convolve8_dspr2),
@@ -1344,7 +1410,8 @@ INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
     make_tuple(64, 64, &convolve8_dspr2)));
 #endif
 
-#if HAVE_MSA
+// TODO(any): Make MSA versions support 128x128 128x64 64x128 block sizes
+#if HAVE_MSA && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 const ConvolveFunctions convolve8_msa(
     vpx_convolve_copy_msa, vpx_convolve_avg_msa,
     vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa,
@@ -1355,6 +1422,11 @@ const ConvolveFunctions convolve8_msa(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_msa),
+    make_tuple(64, 128, &convolve8_msa),
+    make_tuple(128, 128, &convolve8_msa),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_msa),
     make_tuple(8, 4, &convolve8_msa),
     make_tuple(4, 8, &convolve8_msa),
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index d7c6fcec4..34223eac8 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -22,8 +22,6 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-#define MAX_CU_SIZE 128
-
 using libvpx_test::ACMRandom;
 
 namespace {
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index c312899a6..1f8bf1e22 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -25,8 +25,6 @@
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_mem/vpx_mem.h"
 
-#define MAX_CU_SIZE 128
-
 using libvpx_test::ACMRandom;
 
 namespace {
diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c
index 2d1c927cb..2e85ed481 100644
--- a/vpx_dsp/vpx_convolve.c
+++ b/vpx_dsp/vpx_convolve.c
@@ -130,18 +130,21 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride,
   //   --Must round-up because block may be located at sub-pixel position.
   //   --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   //   --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint8_t temp[135 * 64];
+  uint8_t temp[MAX_EXT_SIZE * MAX_CU_SIZE];
   int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
+  assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                 temp, MAX_CU_SIZE,
                  x_filters, x0_q4, x_step_q4, w, intermediate_height);
-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+  convolve_vert(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE,
+                dst, dst_stride,
                 y_filters, y0_q4, y_step_q4, w, h);
 }
 
@@ -237,13 +240,14 @@ void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
   /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
-  assert(w <= 64);
-  assert(h <= 64);
+  DECLARE_ALIGNED(16, uint8_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
 
-  vpx_convolve8_c(src, src_stride, temp, 64,
+  vpx_convolve8_c(src, src_stride, temp, MAX_CU_SIZE,
                   filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+  vpx_convolve_avg_c(temp, MAX_CU_SIZE, dst, dst_stride,
+                     NULL, 0, NULL, 0, w, h);
 }
 
 void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -459,22 +463,23 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
   //   --Must round-up because block may be located at sub-pixel position.
   //   --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   //   --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[64 * 135];
+  uint16_t temp[MAX_EXT_SIZE * MAX_CU_SIZE];
   int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                        src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                        CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
                         x_filters, x0_q4, x_step_q4, w,
                         intermediate_height, bd);
-  highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
-                       64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
-                       w, h, bd);
+  highbd_convolve_vert(
+      CONVERT_TO_BYTEPTR(temp) + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE,
+      dst, dst_stride,
+      y_filters, y0_q4, y_step_q4, w, h, bd);
 }
 
@@ -556,13 +561,15 @@ void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h, int bd) {
   // Fixed size intermediate buffer places limits on parameters.
-  DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
-  assert(w <= 64);
-  assert(h <= 64);
+  DECLARE_ALIGNED(16, uint16_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
 
-  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+  vpx_highbd_convolve8_c(src, src_stride,
+                         CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
                          filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
-  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
+  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
+                            dst, dst_stride,
                             NULL, 0, NULL, 0, w, h, bd);
 }
diff --git a/vpx_dsp/vpx_convolve.h b/vpx_dsp/vpx_convolve.h
index 9ed3f1750..bd8679d10 100644
--- a/vpx_dsp/vpx_convolve.h
+++ b/vpx_dsp/vpx_convolve.h
@@ -17,6 +17,24 @@ extern "C" {
 #endif
 
+// Note: Fixed size intermediate buffers place limits on parameters
+// of some functions. 2d filtering proceeds in 2 steps:
+//   (1) Interpolate horizontally into an intermediate buffer, temp.
+//   (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (135):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 64x64 pixels.
+// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+//   original frame (in 1/16th pixel units).
+// --Must round-up because block may be located at sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+# define MAX_EXT_SIZE 263
+#else
+# define MAX_EXT_SIZE 135
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index b4e6f4c27..8d9bf558d 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -20,6 +20,12 @@ extern "C" {
 #endif
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+# define MAX_CU_SIZE 128
+#else
+# define MAX_CU_SIZE 64
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 #define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
 #define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 2ce0b99fb..583d9fa89 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -466,52 +466,44 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 # Sub Pixel Filters
 #
-add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;
-
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_2d ssse3/;
-
-add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_horiz/;
-
-add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_vert/;
-
-add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_2d/;
-
-add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_horiz/;
-
-add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_vert/;
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+
+specialize qw/vpx_convolve_copy /, "$sse2_x86inc";
+specialize qw/vpx_convolve_avg /, "$sse2_x86inc";
+specialize qw/vpx_convolve8 sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_avg sse2 ssse3/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3/;
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3/;
+specialize qw/vpx_scaled_2d ssse3/;
+
+# TODO(any): These need to be extended to up to 128x128 block sizes
+if (!(vpx_config("CONFIG_VP10") eq "yes" && vpx_config("CONFIG_EXT_PARTITION") eq "yes")) {
+  specialize qw/vpx_convolve_copy neon dspr2 msa/;
+  specialize qw/vpx_convolve_avg neon dspr2 msa/;
+  specialize qw/vpx_convolve8 neon dspr2 msa/;
+  specialize qw/vpx_convolve8_horiz neon dspr2 msa/;
+  specialize qw/vpx_convolve8_vert neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg_horiz neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg_vert neon dspr2 msa/;
+}
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  #
-  # Sub Pixel Filters
-  #
   add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index b6fbfcf92..95aa790ae 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_convolve.h"
 
 typedef void filter8_1dfunction (
   const uint8_t *src_ptr,
@@ -112,25 +113,27 @@ void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                               int w, int h) { \
   assert(filter_x[3] != 128); \
   assert(filter_y[3] != 128); \
-  assert(w <= 64); \
-  assert(h <= 64); \
+  assert(w <= MAX_CU_SIZE); \
+  assert(h <= MAX_CU_SIZE); \
   assert(x_step_q4 == 16); \
   assert(y_step_q4 == 16); \
   if (filter_x[0] || filter_x[1] || filter_x[2]|| \
       filter_y[0] || filter_y[1] || filter_y[2]) { \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
-    vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+7)]); \
+    vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+                              fdata2, MAX_CU_SIZE, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h + 7); \
-    vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+    vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_CU_SIZE, MAX_CU_SIZE, \
+                                    dst, dst_stride, \
                                     filter_x, x_step_q4, filter_y, \
                                     y_step_q4, w, h); \
   } else { \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
-    vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+1)]); \
+    vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_CU_SIZE, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h + 1); \
-    vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+    vpx_convolve8_##avg##vert_##opt(fdata2, MAX_CU_SIZE, dst, dst_stride, \
                                     filter_x, x_step_q4, filter_y, \
                                     y_step_q4, w, h); \
   } \
@@ -250,31 +253,40 @@ void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                      const int16_t *filter_x, int x_step_q4, \
                                      const int16_t *filter_y, int y_step_q4, \
                                      int w, int h, int bd) { \
-  assert(w <= 64); \
-  assert(h <= 64); \
+  assert(w <= MAX_CU_SIZE); \
+  assert(h <= MAX_CU_SIZE); \
   if (x_step_q4 == 16 && y_step_q4 == 16) { \
     if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
         filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
-      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
-      vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
-                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+7)]); \
+      vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, \
+                                       src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), \
+                                       MAX_CU_SIZE, \
                                        filter_x, x_step_q4, \
                                        filter_y, y_step_q4, \
                                        w, h + 7, bd); \
-      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
-                                             64, dst, dst_stride, \
-                                             filter_x, x_step_q4, \
-                                             filter_y, y_step_q4, \
-                                             w, h, bd); \
+      vpx_highbd_convolve8_##avg##vert_##opt( \
+          CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_CU_SIZE, \
+          MAX_CU_SIZE, \
+          dst, \
+          dst_stride, \
+          filter_x, x_step_q4, \
+          filter_y, y_step_q4, \
+          w, h, bd); \
     } else { \
-      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
-      vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
-                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+1)]); \
+      vpx_highbd_convolve8_horiz_##opt(src, \
+                                       src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), \
+                                       MAX_CU_SIZE, \
                                        filter_x, x_step_q4, \
                                        filter_y, y_step_q4, \
                                        w, h + 1, bd); \
-      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
-                                             dst, dst_stride, \
+      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), \
+                                             MAX_CU_SIZE, \
+                                             dst, \
+                                             dst_stride, \
                                              filter_x, x_step_q4, \
                                              filter_y, y_step_q4, \
                                              w, h, bd); \
diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index abc027065..6d43fc18e 100644
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -46,6 +46,119 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   je .w16
   cmp r4d, 32
   je .w32
+
+%if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  cmp r4d, 64
+  je .w64
+%ifidn %2, highbd
+  cmp r4d, 128
+  je .w128
+
+.w256:
+  mov r4d, dword hm
+.loop256:
+  movu m0, [srcq]
+  movu m1, [srcq+16]
+  movu m2, [srcq+32]
+  movu m3, [srcq+48]
+%ifidn %1, avg
+  pavg m0, [dstq]
+  pavg m1, [dstq+16]
+  pavg m2, [dstq+32]
+  pavg m3, [dstq+48]
+%endif
+  mova [dstq   ], m0
+  mova [dstq+16], m1
+  mova [dstq+32], m2
+  mova [dstq+48], m3
+  movu m0, [srcq+64]
+  movu m1, [srcq+80]
+  movu m2, [srcq+96]
+  movu m3, [srcq+112]
+%ifidn %1, avg
+  pavg m0, [dstq+64]
+  pavg m1, [dstq+80]
+  pavg m2, [dstq+96]
+  pavg m3, [dstq+112]
+%endif
+  mova [dstq+64], m0
+  mova [dstq+80], m1
+  mova [dstq+96], m2
+  mova [dstq+112], m3
+  movu m0, [srcq+128]
+  movu m1, [srcq+128+16]
+  movu m2, [srcq+128+32]
+  movu m3, [srcq+128+48]
+%ifidn %1, avg
+  pavg m0, [dstq+128]
+  pavg m1, [dstq+128+16]
+  pavg m2, [dstq+128+32]
+  pavg m3, [dstq+128+48]
+%endif
+  mova [dstq+128   ], m0
+  mova [dstq+128+16], m1
+  mova [dstq+128+32], m2
+  mova [dstq+128+48], m3
+  movu m0, [srcq+128+64]
+  movu m1, [srcq+128+80]
+  movu m2, [srcq+128+96]
+  movu m3, [srcq+128+112]
+  add srcq, src_strideq
+%ifidn %1, avg
+  pavg m0, [dstq+128+64]
+  pavg m1, [dstq+128+80]
+  pavg m2, [dstq+128+96]
+  pavg m3, [dstq+128+112]
+%endif
+  mova [dstq+128+64], m0
+  mova [dstq+128+80], m1
+  mova [dstq+128+96], m2
+  mova [dstq+128+112], m3
+  add dstq, dst_strideq
+  sub r4d, 1
+  jnz .loop256
+  RET
+%endif
+
+.w128:
+  mov r4d, dword hm
+.loop128:
+  movu m0, [srcq]
+  movu m1, [srcq+16]
+  movu m2, [srcq+32]
+  movu m3, [srcq+48]
+%ifidn %1, avg
+  pavg m0, [dstq]
+  pavg m1, [dstq+16]
+  pavg m2, [dstq+32]
+  pavg m3, [dstq+48]
+%endif
+  mova [dstq   ], m0
+  mova [dstq+16], m1
+  mova [dstq+32], m2
+  mova [dstq+48], m3
+  movu m0, [srcq+64]
+  movu m1, [srcq+80]
+  movu m2, [srcq+96]
+  movu m3, [srcq+112]
+  add srcq, src_strideq
+%ifidn %1, avg
+  pavg m0, [dstq+64]
+  pavg m1, [dstq+80]
+  pavg m2, [dstq+96]
+  pavg m3, [dstq+112]
+%endif
+  mova [dstq+64], m0
+  mova [dstq+80], m1
+  mova [dstq+96], m2
+  mova [dstq+112], m3
+  add dstq, dst_strideq
+  sub r4d, 1
+  jnz .loop128
+  RET
+
+%else  ; CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 %ifidn %2, highbd
   cmp r4d, 64
   je .w64
@@ -82,10 +195,11 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   mova [dstq+96], m2
   mova [dstq+112], m3
   add dstq, dst_strideq
-  dec r4d
+  sub r4d, 1
   jnz .loop128
   RET
 %endif
+%endif  ; CONFIG_VP10 && CONFIG_EXT_PARTITION
 
 .w64
   mov r4d, dword hm
@@ -106,7 +220,7 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   mova [dstq+32], m2
   mova [dstq+48], m3
   add dstq, dst_strideq
-  dec r4d
+  sub r4d, 1
   jnz .loop64
   RET
 
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 6fd52087c..6c5991858 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -844,34 +844,49 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
   //   --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   //   --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
   //   --Require an additional 8 rows for the horiz_w8 transpose tail.
-  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_CU_SIZE]);
   const int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
   if (w >= 8) {
     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            src_stride,
+                            temp,
+                            MAX_CU_SIZE,
+                            x_filters, x0_q4, x_step_q4,
                             w, intermediate_height);
   } else {
     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            src_stride,
+                            temp,
+                            MAX_CU_SIZE,
+                            x_filters, x0_q4, x_step_q4,
                             w, intermediate_height);
   }
 
   if (w >= 16) {
-    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w16(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                            MAX_CU_SIZE,
+                            dst,
+                            dst_stride,
+                            y_filters, y0_q4, y_step_q4, w, h);
   } else if (w == 8) {
-    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w8(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                           MAX_CU_SIZE,
+                           dst,
+                           dst_stride,
+                           y_filters, y0_q4, y_step_q4, w, h);
   } else {
-    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w4(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                           MAX_CU_SIZE,
+                           dst,
+                           dst_stride,
+                           y_filters, y0_q4, y_step_q4, w, h);
   }
 }
-- 
2.40.0
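
A note on the new bounds: the MAX_EXT_SIZE values the patch introduces in
vpx_dsp/vpx_convolve.h (135 without ext-partition, 263 with it) follow the
derivation in that header's comment. With the smallest supported scale
(y_step_q4 == 32) and the worst sub-pixel start (y0_q4 == 15), the vertical
pass reads ceil(((h - 1) * 32 + 15) / 16) + SUBPEL_TAPS intermediate rows.
A minimal standalone sketch of that arithmetic (not part of the patch; the
helper name is made up for illustration):

#include <assert.h>

#define SUBPEL_TAPS 8 /* 8-tap filters, as in vpx_dsp/vpx_filter.h */

/* Worst-case number of intermediate rows for an h-row output block:
 * h output rows at x1/2 scale span (h - 1) * 32 + 15 units of 1/16th
 * pel in the source; round up to whole pixels, add the filter tails. */
static int max_intermediate_rows(int h) {
  return ((h - 1) * 32 + 15 + 15) / 16 + SUBPEL_TAPS;
}

int main(void) {
  assert(max_intermediate_rows(64) == 135);  /* MAX_EXT_SIZE, 64x64 blocks */
  assert(max_intermediate_rows(128) == 263); /* MAX_EXT_SIZE, 128x128 blocks */
  return 0;
}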
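
The same derivation also collapses to a closed form, MAX_EXT_SIZE ==
2 * MAX_CU_SIZE + 7 (2 * 64 + 7 == 135 and 2 * 128 + 7 == 263), because
ceil(((h - 1) * 32 + 15) / 16) equals 2 * h - 1. A hypothetical C11
compile-time check, not present in the patch, that would keep the two
macros in sync (assuming vpx_config.h defines the CONFIG_* symbols first):

#include "./vpx_config.h"
#include "vpx_dsp/vpx_convolve.h"   /* MAX_EXT_SIZE */
#include "vpx_dsp/vpx_dsp_common.h" /* MAX_CU_SIZE */

/* Holds under both configurations: the intermediate buffer is at most
 * twice the block height (x1/2 scale) plus SUBPEL_TAPS - 1 tail rows. */
_Static_assert(MAX_EXT_SIZE == 2 * MAX_CU_SIZE + 7,
               "MAX_EXT_SIZE out of sync with MAX_CU_SIZE");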