namespace {
-static const unsigned int kMaxDimension = 64;
+static const unsigned int kMaxDimension = MAX_CU_SIZE;
typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
// = 23
// and filter_max_width = 16
//
- uint8_t intermediate_buffer[71 * kMaxDimension];
+ uint8_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension];
const int intermediate_next_stride = 1 - intermediate_height * output_width;
// Horizontal pass (src -> transposed intermediate).
assert(output_width <= kMaxDimension);
assert(output_height <= kMaxDimension);
- filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+ filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, kMaxDimension,
output_width, output_height);
- block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+ block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
output_width, output_height);
}
* = 23
* and filter_max_width = 16
*/
- uint16_t intermediate_buffer[71 * kMaxDimension];
+ uint16_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension];
const int intermediate_next_stride = 1 - intermediate_height * output_width;
// Horizontal pass (src -> transposed intermediate).
assert(output_width <= kMaxDimension);
assert(output_height <= kMaxDimension);
- highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+ highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+ tmp, kMaxDimension,
output_width, output_height, bd);
- highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+ highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
output_width, output_height);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
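Aside: the (kMaxDimension + 8) sizing above follows from the 8-tap vertical
pass, which consumes output_height + SUBPEL_TAPS - 1 intermediate rows. A
minimal sketch of the bound, assuming libvpx's SUBPEL_TAPS == 8:

/* Sketch: the two-pass reference filter reads at most
 * kMaxDimension + SUBPEL_TAPS - 1 = kMaxDimension + 7 intermediate rows,
 * so (kMaxDimension + 8) rows leave one row of slack. */
#define SUBPEL_TAPS 8
static int intermediate_rows_needed(int output_height) {
  return output_height + SUBPEL_TAPS - 1; /* <= kMaxDimension + 7 */
}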
protected:
static const int kDataAlignment = 16;
- static const int kOuterBlockSize = 256;
+ static const int kOuterBlockSize = 4 * kMaxDimension;
static const int kInputStride = kOuterBlockSize;
static const int kOutputStride = kOuterBlockSize;
static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
void CopyOutputToRef() {
memcpy(output_ref_, output_, kOutputBufferSize);
#if CONFIG_VP9_HIGHBITDEPTH
- memcpy(output16_ref_, output16_, kOutputBufferSize);
+ memcpy(output16_ref_, output16_,
+ kOutputBufferSize * sizeof(*output16_ref_));
#endif
}
}
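The sizeof scaling in CopyOutputToRef() above fixes a classic memcpy pitfall:
the byte count argument must be scaled by the element size for non-byte
buffers. A minimal sketch of the corrected idiom:

#include <stdint.h>
#include <string.h>

/* Copy n_elems uint16_t values; without the sizeof factor only half of
 * the buffer would be copied. */
static void copy_u16(uint16_t *dst, const uint16_t *src, size_t n_elems) {
  memcpy(dst, src, n_elems * sizeof(*dst));
}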
uint8_t *input() const {
+ const int index = BorderTop() * kOuterBlockSize + BorderLeft();
#if CONFIG_VP9_HIGHBITDEPTH
if (UUT_->use_highbd_ == 0) {
- return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return input_ + index;
} else {
- return CONVERT_TO_BYTEPTR(input16_ + BorderTop() * kOuterBlockSize +
- BorderLeft());
+ return CONVERT_TO_BYTEPTR(input16_ + index);
}
#else
- return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return input_ + index;
#endif
}
uint8_t *output() const {
+ const int index = BorderTop() * kOuterBlockSize + BorderLeft();
#if CONFIG_VP9_HIGHBITDEPTH
if (UUT_->use_highbd_ == 0) {
- return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return output_ + index;
} else {
- return CONVERT_TO_BYTEPTR(output16_ + BorderTop() * kOuterBlockSize +
- BorderLeft());
+ return CONVERT_TO_BYTEPTR(output16_ + index);
}
#else
- return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return output_ + index;
#endif
}
uint8_t *output_ref() const {
+ const int index = BorderTop() * kOuterBlockSize + BorderLeft();
#if CONFIG_VP9_HIGHBITDEPTH
if (UUT_->use_highbd_ == 0) {
- return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return output_ref_ + index;
} else {
- return CONVERT_TO_BYTEPTR(output16_ref_ + BorderTop() * kOuterBlockSize +
- BorderLeft());
+ return CONVERT_TO_BYTEPTR(output16_ref_ + index);
}
#else
- return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return output_ref_ + index;
#endif
}
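On the CONVERT_TO_BYTEPTR arithmetic in these accessors: with the
pointer-packing macros as defined in vpx_dsp (reproduced below as an
assumption), offsetting the uint16_t pointer before conversion equals
offsetting the packed pointer after it, so either form is correct; the
accessors above now use the pre-conversion form consistently. A sketch:

#include <assert.h>
#include <stdint.h>

/* Assumed macro shapes from vpx_dsp/vpx_dsp_common.h: a 2-byte aligned
 * uint16_t * is packed into a uint8_t * by halving the address. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

static void check_offset_forms(uint16_t *buf, int index) {
  /* (addr + 2 * index) >> 1 == (addr >> 1) + index */
  assert((uintptr_t)CONVERT_TO_BYTEPTR(buf + index) ==
         (uintptr_t)CONVERT_TO_BYTEPTR(buf) + index);
}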
wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ make_tuple(128, 64, &convolve8_c),
+ make_tuple(64, 128, &convolve8_c),
+ make_tuple(128, 128, &convolve8_c),
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(4, 4, &convolve8_c),
make_tuple(8, 4, &convolve8_c),
make_tuple(4, 8, &convolve8_c),
wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10);
INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ make_tuple(128, 64, &convolve10_c),
+ make_tuple(64, 128, &convolve10_c),
+ make_tuple(128, 128, &convolve10_c),
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(4, 4, &convolve10_c),
make_tuple(8, 4, &convolve10_c),
make_tuple(4, 8, &convolve10_c),
wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12);
INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ make_tuple(128, 64, &convolve12_c),
+ make_tuple(64, 128, &convolve12_c),
+ make_tuple(128, 128, &convolve12_c),
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(4, 4, &convolve12_c),
make_tuple(8, 4, &convolve12_c),
make_tuple(4, 8, &convolve12_c),
vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ make_tuple(128, 64, &convolve8_c),
+ make_tuple(64, 128, &convolve8_c),
+ make_tuple(128, 128, &convolve8_c),
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(4, 4, &convolve8_c),
make_tuple(8, 4, &convolve8_c),
make_tuple(4, 8, &convolve8_c),
wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+INSTANTIATE_TEST_CASE_P(SSE2_8, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ make_tuple(128, 64, &convolve8_sse2),
+ make_tuple(64, 128, &convolve8_sse2),
+ make_tuple(128, 128, &convolve8_sse2),
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(4, 4, &convolve8_sse2),
make_tuple(8, 4, &convolve8_sse2),
make_tuple(4, 8, &convolve8_sse2),
make_tuple(32, 32, &convolve8_sse2),
make_tuple(64, 32, &convolve8_sse2),
make_tuple(32, 64, &convolve8_sse2),
- make_tuple(64, 64, &convolve8_sse2),
+ make_tuple(64, 64, &convolve8_sse2)));
+INSTANTIATE_TEST_CASE_P(SSE2_10, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ make_tuple(128, 64, &convolve10_sse2),
+ make_tuple(64, 128, &convolve10_sse2),
+ make_tuple(128, 128, &convolve10_sse2),
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(4, 4, &convolve10_sse2),
make_tuple(8, 4, &convolve10_sse2),
make_tuple(4, 8, &convolve10_sse2),
make_tuple(32, 32, &convolve10_sse2),
make_tuple(64, 32, &convolve10_sse2),
make_tuple(32, 64, &convolve10_sse2),
- make_tuple(64, 64, &convolve10_sse2),
+ make_tuple(64, 64, &convolve10_sse2)));
+INSTANTIATE_TEST_CASE_P(SSE2_12, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ make_tuple(128, 64, &convolve12_sse2),
+ make_tuple(64, 128, &convolve12_sse2),
+ make_tuple(128, 128, &convolve12_sse2),
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(4, 4, &convolve12_sse2),
make_tuple(8, 4, &convolve12_sse2),
make_tuple(4, 8, &convolve12_sse2),
vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ make_tuple(128, 64, &convolve8_sse2),
+ make_tuple(64, 128, &convolve8_sse2),
+ make_tuple(128, 128, &convolve8_sse2),
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(4, 4, &convolve8_sse2),
make_tuple(8, 4, &convolve8_sse2),
make_tuple(4, 8, &convolve8_sse2),
vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3,
vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
- vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+ vpx_scaled_2d_ssse3, vpx_scaled_avg_2d_c, 0);
INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ make_tuple(128, 64, &convolve8_ssse3),
+ make_tuple(64, 128, &convolve8_ssse3),
+ make_tuple(128, 128, &convolve8_ssse3),
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(4, 4, &convolve8_ssse3),
make_tuple(8, 4, &convolve8_ssse3),
make_tuple(4, 8, &convolve8_ssse3),
vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ make_tuple(128, 64, &convolve8_avx2),
+ make_tuple(64, 128, &convolve8_avx2),
+ make_tuple(128, 128, &convolve8_avx2),
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(4, 4, &convolve8_avx2),
make_tuple(8, 4, &convolve8_avx2),
make_tuple(4, 8, &convolve8_avx2),
make_tuple(64, 64, &convolve8_avx2)));
#endif // HAVE_AVX2 && HAVE_SSSE3
-#if HAVE_NEON
+// TODO(any): Make NEON versions support 128x128, 128x64 and 64x128 block sizes
+#if HAVE_NEON && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
#if HAVE_NEON_ASM
const ConvolveFunctions convolve8_neon(
vpx_convolve_copy_neon, vpx_convolve_avg_neon,
#endif // HAVE_NEON_ASM
INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_neon),
make_tuple(8, 4, &convolve8_neon),
make_tuple(4, 8, &convolve8_neon),
make_tuple(64, 64, &convolve8_neon)));
#endif // HAVE_NEON
-#if HAVE_DSPR2
+// TODO(any): Make DSPR2 versions support 128x128, 128x64 and 64x128 block sizes
+#if HAVE_DSPR2 && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
const ConvolveFunctions convolve8_dspr2(
vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2,
vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2,
vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_dspr2),
make_tuple(8, 4, &convolve8_dspr2),
make_tuple(4, 8, &convolve8_dspr2),
make_tuple(64, 64, &convolve8_dspr2)));
#endif
-#if HAVE_MSA
+// TODO(any): Make MSA versions support 128x128, 128x64 and 64x128 block sizes
+#if HAVE_MSA && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
const ConvolveFunctions convolve8_msa(
vpx_convolve_copy_msa, vpx_convolve_avg_msa,
vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa,
vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_msa),
make_tuple(8, 4, &convolve8_msa),
make_tuple(4, 8, &convolve8_msa),
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-#define MAX_CU_SIZE 128
-
+#include "vpx_dsp/vpx_dsp_common.h"
using libvpx_test::ACMRandom;
namespace {
#include "vpx_dsp/vpx_filter.h"
#include "vpx_mem/vpx_mem.h"
-#define MAX_CU_SIZE 128
-
+#include "vpx_dsp/vpx_dsp_common.h"
using libvpx_test::ACMRandom;
namespace {
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint8_t temp[135 * 64];
+ uint8_t temp[MAX_EXT_SIZE * MAX_CU_SIZE];
int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
- assert(w <= 64);
- assert(h <= 64);
+ assert(w <= MAX_CU_SIZE);
+ assert(h <= MAX_CU_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
- convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+ temp, MAX_CU_SIZE,
x_filters, x0_q4, x_step_q4, w, intermediate_height);
- convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+ convolve_vert(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE,
+ dst, dst_stride,
y_filters, y0_q4, y_step_q4, w, h);
}
const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Fixed size intermediate buffer places limits on parameters. */
- DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
- assert(w <= 64);
- assert(h <= 64);
+ DECLARE_ALIGNED(16, uint8_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]);
+ assert(w <= MAX_CU_SIZE);
+ assert(h <= MAX_CU_SIZE);
- vpx_convolve8_c(src, src_stride, temp, 64,
+ vpx_convolve8_c(src, src_stride, temp, MAX_CU_SIZE,
filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+ vpx_convolve_avg_c(temp, MAX_CU_SIZE, dst, dst_stride,
+ NULL, 0, NULL, 0, w, h);
}
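For context, the second pass above blends the freshly filtered temp block
into dst with round-to-nearest; a scalar sketch matching vpx_convolve_avg_c's
per-pixel operation:

#include <stdint.h>
#include <stddef.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Sketch: dst[x] = round((dst[x] + src[x]) / 2) over a w x h block. */
static void avg_block(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                      ptrdiff_t dst_stride, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = (uint8_t)ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
    src += src_stride;
    dst += dst_stride;
  }
}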
void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint16_t temp[64 * 135];
+ uint16_t temp[MAX_EXT_SIZE * MAX_CU_SIZE];
int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
- assert(w <= 64);
- assert(h <= 64);
+ assert(w <= MAX_CU_SIZE);
+ assert(h <= MAX_CU_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
- highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+ highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+ CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
x_filters, x0_q4, x_step_q4, w,
intermediate_height, bd);
- highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
- 64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
- w, h, bd);
+  highbd_convolve_vert(
+      CONVERT_TO_BYTEPTR(temp) + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+      MAX_CU_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
// Fixed size intermediate buffer places limits on parameters.
- DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
- assert(w <= 64);
- assert(h <= 64);
+ DECLARE_ALIGNED(16, uint16_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]);
+ assert(w <= MAX_CU_SIZE);
+ assert(h <= MAX_CU_SIZE);
- vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+ vpx_highbd_convolve8_c(src, src_stride,
+ CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
- vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
+ dst, dst_stride,
NULL, 0, NULL, 0, w, h, bd);
}
extern "C" {
#endif
+// Note: Fixed size intermediate buffers place limits on the parameters
+// of some functions. 2d filtering proceeds in 2 steps:
+// (1) Interpolate horizontally into an intermediate buffer, temp.
+// (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (135):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 64x64 pixels.
+// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+// original frame (in 1/16th pixel units).
+// --Must round-up because block may be located at sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+// With CONFIG_EXT_PARTITION the largest block is 128x128, and the same
+// derivation gives ((128 - 1) * 32 + 15) >> 4 + 8 = 263.
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+# define MAX_EXT_SIZE 263
+#else
+# define MAX_EXT_SIZE 135
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
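A worked check of both bounds (a sketch; the ">> 4" in the comment is a
round-up division, per the round-up bullet above):

/* Sketch: maximum intermediate rows for the 2-step filter. */
static int max_intermediate_rows(int max_block_size) {
  const int max_step_q4 = 32;   /* smallest scale factor is x1/2 */
  const int max_offset_q4 = 15; /* worst-case sub-pel start position */
  const int span_q4 = (max_block_size - 1) * max_step_q4 + max_offset_q4;
  return ((span_q4 + 15) >> 4) + 8; /* round up, then add SUBPEL_TAPS */
}
/* max_intermediate_rows(64) == 135 and max_intermediate_rows(128) == 263. */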
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
extern "C" {
#endif
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+# define MAX_CU_SIZE 128
+#else
+# define MAX_CU_SIZE 64
+#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
#
# Sub Pixel Filters
#
-add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;
-
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_2d ssse3/;
-
-add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_horiz/;
-
-add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_vert/;
-
-add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_2d/;
-
-add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_horiz/;
-
-add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_vert/;
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+
+specialize qw/vpx_convolve_copy/, "$sse2_x86inc";
+specialize qw/vpx_convolve_avg/, "$sse2_x86inc";
+specialize qw/vpx_convolve8 sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_avg sse2 ssse3/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3/;
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3/;
+specialize qw/vpx_scaled_2d ssse3/;
+
+# TODO(any): These need to be extended to support block sizes up to 128x128
+if (!(vpx_config("CONFIG_VP10") eq "yes" && vpx_config("CONFIG_EXT_PARTITION") eq "yes")) {
+ specialize qw/vpx_convolve_copy neon dspr2 msa/;
+ specialize qw/vpx_convolve_avg neon dspr2 msa/;
+ specialize qw/vpx_convolve8 neon dspr2 msa/;
+ specialize qw/vpx_convolve8_horiz neon dspr2 msa/;
+ specialize qw/vpx_convolve8_vert neon dspr2 msa/;
+ specialize qw/vpx_convolve8_avg neon dspr2 msa/;
+ specialize qw/vpx_convolve8_avg_horiz neon dspr2 msa/;
+ specialize qw/vpx_convolve8_avg_vert neon dspr2 msa/;
+}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- #
- # Sub Pixel Filters
- #
add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_convolve.h"
typedef void filter8_1dfunction (
const uint8_t *src_ptr,
int w, int h) { \
assert(filter_x[3] != 128); \
assert(filter_y[3] != 128); \
- assert(w <= 64); \
- assert(h <= 64); \
+ assert(w <= MAX_CU_SIZE); \
+ assert(h <= MAX_CU_SIZE); \
assert(x_step_q4 == 16); \
assert(y_step_q4 == 16); \
if (filter_x[0] || filter_x[1] || filter_x[2]|| \
filter_y[0] || filter_y[1] || filter_y[2]) { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
- vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE + 7)]); \
+ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ fdata2, MAX_CU_SIZE, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 7); \
- vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_CU_SIZE, MAX_CU_SIZE, \
+ dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} else { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
- vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE + 1)]); \
+ vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_CU_SIZE, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 1); \
- vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ vpx_convolve8_##avg##vert_##opt(fdata2, MAX_CU_SIZE, dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} \
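The fdata2 + 3 * MAX_CU_SIZE base passed to the vertical pass above
re-centers the 8-tap window: the horizontal pass starts 3 source rows early
and emits h + 7 rows. A self-contained sketch of the row bookkeeping (the
horizontal filter is stubbed to a copy; all names here are illustrative,
not the library's):

#include <stdint.h>
#include <stddef.h>

enum { TAPS = 8, MAXSZ = 128 };

/* Pass 1 fills tmp rows 0 .. h + 6 from source rows -3 .. h + 3; pass 2
 * derives dst row y from tmp rows y .. y + 7, exactly the window an 8-tap
 * filter centered between taps 3 and 4 requires. */
static void conv2d_rows_sketch(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *kernel, int w, int h) {
  static uint8_t tmp[(MAXSZ + 7) * MAXSZ];
  int x, y, t;
  for (y = 0; y < h + 7; ++y) /* horizontal pass, stubbed to a copy */
    for (x = 0; x < w; ++x)
      tmp[y * MAXSZ + x] = src[(y - 3) * src_stride + x];
  for (y = 0; y < h; ++y) { /* 8-tap vertical pass */
    for (x = 0; x < w; ++x) {
      int sum = 1 << 6; /* rounding term for a 7-bit kernel */
      for (t = 0; t < TAPS; ++t) sum += kernel[t] * tmp[(y + t) * MAXSZ + x];
      sum >>= 7;
      dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    dst += dst_stride;
  }
}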
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
int w, int h, int bd) { \
- assert(w <= 64); \
- assert(h <= 64); \
+ assert(w <= MAX_CU_SIZE); \
+ assert(h <= MAX_CU_SIZE); \
if (x_step_q4 == 16 && y_step_q4 == 16) { \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE + 7)]); \
+ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, \
+ src_stride, \
+ CONVERT_TO_BYTEPTR(fdata2), \
+ MAX_CU_SIZE, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h + 7, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
- 64, dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt( \
+ CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_CU_SIZE, \
+ MAX_CU_SIZE, \
+ dst, \
+ dst_stride, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h, bd); \
} else { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE + 1)]); \
+ vpx_highbd_convolve8_horiz_##opt(src, \
+ src_stride, \
+ CONVERT_TO_BYTEPTR(fdata2), \
+ MAX_CU_SIZE, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h + 1, bd); \
- vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
- dst, dst_stride, \
+ vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), \
+ MAX_CU_SIZE, \
+ dst, \
+ dst_stride, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h, bd); \
je .w16
cmp r4d, 32
je .w32
+
+%if CONFIG_VP10 && CONFIG_EXT_PARTITION
+ cmp r4d, 64
+ je .w64
+%ifidn %2, highbd
+ cmp r4d, 128
+ je .w128
+
+.w256:
+ mov r4d, dword hm
+.loop256:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ movu m0, [srcq+128]
+ movu m1, [srcq+128+16]
+ movu m2, [srcq+128+32]
+ movu m3, [srcq+128+48]
+%ifidn %1, avg
+ pavg m0, [dstq+128]
+ pavg m1, [dstq+128+16]
+ pavg m2, [dstq+128+32]
+ pavg m3, [dstq+128+48]
+%endif
+ mova [dstq+128 ], m0
+ mova [dstq+128+16], m1
+ mova [dstq+128+32], m2
+ mova [dstq+128+48], m3
+ movu m0, [srcq+128+64]
+ movu m1, [srcq+128+80]
+ movu m2, [srcq+128+96]
+ movu m3, [srcq+128+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+128+64]
+ pavg m1, [dstq+128+80]
+ pavg m2, [dstq+128+96]
+ pavg m3, [dstq+128+112]
+%endif
+ mova [dstq+128+64], m0
+ mova [dstq+128+80], m1
+ mova [dstq+128+96], m2
+ mova [dstq+128+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop256
+ RET
+%endif
+
+.w128:
+ mov r4d, dword hm
+.loop128:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop128
+ RET
+
+%else ; CONFIG_VP10 && CONFIG_EXT_PARTITION
+
%ifidn %2, highbd
cmp r4d, 64
je .w64
mova [dstq+96], m2
mova [dstq+112], m3
add dstq, dst_strideq
- dec r4d
+ sub r4d, 1
jnz .loop128
RET
%endif
+%endif ; CONFIG_VP10 && CONFIG_EXT_PARTITION
.w64:
mov r4d, dword hm
mova [dstq+32], m2
mova [dstq+48], m3
add dstq, dst_strideq
- dec r4d
+ sub r4d, 1
jnz .loop64
RET
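A scalar restatement of the new wide rows (a sketch): each .w128 iteration
moves 128 bytes per row through eight XMM loads/stores, and the avg variant
applies pavg, a round-up average; for highbd the lanes are 16-bit words
(pavgw) rather than bytes, and .w256 covers 256 bytes, i.e. 128 uint16
pixels per row.

#include <stdint.h>

/* Sketch of one row: nbytes == 128 for .w128, 256 for highbd .w256.
 * pavgb computes (a + b + 1) >> 1 per byte lane; this is the scalar
 * equivalent for the 8-bit case. */
static void copy_or_avg_row(const uint8_t *src, uint8_t *dst, int nbytes,
                            int do_avg) {
  int i;
  for (i = 0; i < nbytes; ++i)
    dst[i] = do_avg ? (uint8_t)((src[i] + dst[i] + 1) >> 1) : src[i];
}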
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
// --Require an additional 8 rows for the horiz_w8 transpose tail.
- DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_CU_SIZE]);
const int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
- assert(w <= 64);
- assert(h <= 64);
+ assert(w <= MAX_CU_SIZE);
+ assert(h <= MAX_CU_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
if (w >= 8) {
scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            src_stride, temp, MAX_CU_SIZE,
+                            x_filters, x0_q4, x_step_q4,
w, intermediate_height);
} else {
scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            src_stride, temp, MAX_CU_SIZE,
+                            x_filters, x0_q4, x_step_q4,
w, intermediate_height);
}
if (w >= 16) {
- scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w16(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                            MAX_CU_SIZE, dst, dst_stride,
+                            y_filters, y0_q4, y_step_q4, w, h);
} else if (w == 8) {
- scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w8(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                           MAX_CU_SIZE, dst, dst_stride,
+                           y_filters, y0_q4, y_step_q4, w, h);
} else {
- scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w4(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                           MAX_CU_SIZE, dst, dst_stride,
+                           y_filters, y0_q4, y_step_q4, w, h);
}
}