From: Peter de Rivaz Date: Thu, 16 Oct 2014 13:00:54 +0000 (+0100) Subject: Added sse2 acceleration for highbitdepth variance X-Git-Tag: v1.4.0~499^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=48032bfcdb412a8e7f9d89154c4ac8fbb3f8fe72;p=libvpx Added sse2 acceleration for highbitdepth variance Change-Id: I446bdf3a405e4e9d2aa633d6281d66ea0cdfd79f (cherry picked from commit d7422b2b1eb9f0011a8c379c2be680d6892b16bc) (cherry picked from commit 6d741e4d76a7d9ece69ca117d1d9e2f9ee48ef8c) --- diff --git a/test/variance_test.cc b/test/variance_test.cc index a438d1721..4d279f686 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -7,16 +7,18 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include -#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include +#include +#include "test/acm_random.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "third_party/googletest/src/include/gtest/gtest.h" -#include "vpx/vpx_integer.h" #include "./vpx_config.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" #if CONFIG_VP8_ENCODER # include "./vp8_rtcd.h" @@ -26,7 +28,6 @@ # include "./vp9_rtcd.h" # include "vp9/encoder/vp9_variance.h" #endif -#include "test/acm_random.h" namespace { @@ -43,31 +44,106 @@ static unsigned int mb_ss_ref(const int16_t *src) { return res; } -static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src, - int l2w, int l2h, unsigned int *sse_ptr) { +static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref, + int l2w, int l2h, int src_stride_coeff, + int ref_stride_coeff, uint32_t *sse_ptr, + bool use_high_bit_depth_, + vpx_bit_depth_t bit_depth) { +#if CONFIG_VP9_HIGHBITDEPTH + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) 
{ + int diff; + if (!use_high_bit_depth_) { + diff = ref[w * y * ref_stride_coeff + x] - + src[w * y * src_stride_coeff + x]; + se += diff; + sse += diff * diff; + } else { + diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] - + CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x]; + se += diff; + sse += diff * diff; + } + } + } + if (bit_depth > VPX_BITS_8) { + sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8)); + se = ROUND_POWER_OF_TWO(se, bit_depth - 8); + } +#else int se = 0; unsigned int sse = 0; - const int w = 1 << l2w, h = 1 << l2h; + const int w = 1 << l2w; + const int h = 1 << l2h; for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - int diff = ref[w * y + x] - src[w * y + x]; + int diff = ref[w * y * ref_stride_coeff + x] - + src[w * y * src_stride_coeff + x]; se += diff; sse += diff * diff; } } +#endif // CONFIG_VP9_HIGHBITDEPTH *sse_ptr = sse; return sse - (((int64_t) se * se) >> (l2w + l2h)); } static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, int l2w, int l2h, int xoff, int yoff, - unsigned int *sse_ptr) { + unsigned int *sse_ptr, + bool use_high_bit_depth_, + vpx_bit_depth_t bit_depth) { +#if CONFIG_VP9_HIGHBITDEPTH + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // Bilinear interpolation at a 16th pel step. 
+ if (!use_high_bit_depth_) { + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = r - src[w * y + x]; + se += diff; + sse += diff * diff; + } else { + uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const int a1 = ref16[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref16[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref16[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = r - src16[w * y + x]; + se += diff; + sse += diff * diff; + } + } + } + if (bit_depth > VPX_BITS_8) { + sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8)); + se = ROUND_POWER_OF_TWO(se, bit_depth - 8); + } +#else int se = 0; unsigned int sse = 0; - const int w = 1 << l2w, h = 1 << l2h; + const int w = 1 << l2w; + const int h = 1 << l2h; for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - // bilinear interpolation at a 16th pel step + // Bilinear interpolation at a 16th pel step. 
const int a1 = ref[(w + 1) * (y + 0) + x + 0]; const int a2 = ref[(w + 1) * (y + 0) + x + 1]; const int b1 = ref[(w + 1) * (y + 1) + x + 0]; @@ -75,11 +151,12 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); const int r = a + (((b - a) * yoff + 8) >> 4); - int diff = r - src[w * y + x]; + const int diff = r - src[w * y + x]; se += diff; sse += diff * diff; } } +#endif // CONFIG_VP9_HIGHBITDEPTH *sse_ptr = sse; return sse - (((int64_t) se * se) >> (l2w + l2h)); } @@ -130,40 +207,74 @@ void SumOfSquaresTest::RefTest() { template class VarianceTest - : public ::testing::TestWithParam > { + : public ::testing::TestWithParam > { public: virtual void SetUp() { - const tuple& params = this->GetParam(); + const tuple& params = this->GetParam(); log2width_ = get<0>(params); width_ = 1 << log2width_; log2height_ = get<1>(params); height_ = 1 << log2height_; variance_ = get<2>(params); + if (get<3>(params)) { + bit_depth_ = static_cast(get<3>(params)); + use_high_bit_depth_ = true; + } else { + bit_depth_ = VPX_BITS_8; + use_high_bit_depth_ = false; + } + mask_ = (1 << bit_depth_) - 1; rnd_.Reset(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; - src_ = reinterpret_cast(vpx_memalign(16, block_size_)); - ref_ = new uint8_t[block_size_]; +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + src_ = reinterpret_cast(vpx_memalign(16, block_size_ * 2)); + ref_ = new uint8_t[block_size_ * 2]; + } else { + src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + vpx_memalign(16, block_size_ * 2 * sizeof(uint16_t)))); + ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]); + } +#else + src_ = reinterpret_cast(vpx_memalign(16, block_size_ * 2)); + ref_ = new uint8_t[block_size_ * 2]; +#endif ASSERT_TRUE(src_ != NULL); ASSERT_TRUE(ref_ != NULL); } virtual void TearDown() { +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + 
vpx_free(src_); + delete[] ref_; + } else { + vpx_free(CONVERT_TO_SHORTPTR(src_)); + delete[] CONVERT_TO_SHORTPTR(ref_); + } +#else vpx_free(src_); delete[] ref_; +#endif libvpx_test::ClearSystemState(); } protected: void ZeroTest(); void RefTest(); + void RefStrideTest(); void OneQuarterTest(); ACMRandom rnd_; - uint8_t* src_; - uint8_t* ref_; + uint8_t *src_; + uint8_t *ref_; int width_, log2width_; int height_, log2height_; + vpx_bit_depth_t bit_depth_; + int mask_; + bool use_high_bit_depth_; int block_size_; VarianceFunctionType variance_; }; @@ -171,14 +282,32 @@ class VarianceTest template void VarianceTest::ZeroTest() { for (int i = 0; i <= 255; ++i) { +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + memset(src_, i, block_size_); + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8), + block_size_); + } +#else memset(src_, i, block_size_); +#endif for (int j = 0; j <= 255; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + memset(ref_, j, block_size_); + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), j << (bit_depth_ - 8), + block_size_); + } +#else memset(ref_, j, block_size_); +#endif unsigned int sse; unsigned int var; ASM_REGISTER_STATE_CHECK( var = variance_(src_, width_, ref_, width_, &sse)); - EXPECT_EQ(0u, var) << "src values: " << i << "ref values: " << j; + EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j; } } } @@ -187,15 +316,64 @@ template void VarianceTest::RefTest() { for (int i = 0; i < 10; ++i) { for (int j = 0; j < block_size_; j++) { +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { src_[j] = rnd_.Rand8(); ref_[j] = rnd_.Rand8(); + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + } +#else + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); +#endif } unsigned int sse1, sse2; unsigned int var1; + const int stride_coeff = 1; ASM_REGISTER_STATE_CHECK( var1 = variance_(src_, width_, ref_, 
width_, &sse1)); const unsigned int var2 = variance_ref(src_, ref_, log2width_, - log2height_, &sse2); + log2height_, stride_coeff, + stride_coeff, &sse2, + use_high_bit_depth_, bit_depth_); + EXPECT_EQ(sse1, sse2); + EXPECT_EQ(var1, var2); + } +} + +template +void VarianceTest::RefStrideTest() { + for (int i = 0; i < 10; ++i) { + int ref_stride_coeff = i % 2; + int src_stride_coeff = (i >> 1) % 2; + for (int j = 0; j < block_size_; j++) { + int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_; + int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_; +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + src_[src_ind] = rnd_.Rand8(); + ref_[ref_ind] = rnd_.Rand8(); + } else { + CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask_; + CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask_; + } +#else + src_[src_ind] = rnd_.Rand8(); + ref_[ref_ind] = rnd_.Rand8(); +#endif + } + unsigned int sse1, sse2; + unsigned int var1; + + ASM_REGISTER_STATE_CHECK( + var1 = variance_(src_, width_ * src_stride_coeff, + ref_, width_ * ref_stride_coeff, &sse1)); + const unsigned int var2 = variance_ref(src_, ref_, log2width_, + log2height_, src_stride_coeff, + ref_stride_coeff, &sse2, + use_high_bit_depth_, bit_depth_); EXPECT_EQ(sse1, sse2); EXPECT_EQ(var1, var2); } @@ -203,10 +381,23 @@ void VarianceTest::RefTest() { template void VarianceTest::OneQuarterTest() { - memset(src_, 255, block_size_); const int half = block_size_ / 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + memset(src_, 255, block_size_); + memset(ref_, 255, half); + memset(ref_ + half, 0, half); + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << (bit_depth_ - 8), + block_size_); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << (bit_depth_ - 8), half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); + } +#else + memset(src_, 255, block_size_); memset(ref_, 255, half); memset(ref_ + half, 0, half); +#endif unsigned 
int var; ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse)); @@ -264,8 +455,10 @@ void MseTest::RefTest_mse() { ref_[j] = rnd.Rand8(); } unsigned int sse1, sse2; + const int stride_coeff = 1; ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse1)); - variance_ref(src_, ref_, log2width_, log2height_, &sse2); + variance_ref(src_, ref_, log2width_, log2height_, stride_coeff, + stride_coeff, &sse2, false, VPX_BITS_8); EXPECT_EQ(sse1, sse2); } } @@ -279,9 +472,10 @@ void MseTest::RefTest_sse() { } unsigned int sse2; unsigned int var1; - ASM_REGISTER_STATE_CHECK( - var1 = mse_(src_, width_, ref_, width_)); - variance_ref(src_, ref_, log2width_, log2height_, &sse2); + const int stride_coeff = 1; + ASM_REGISTER_STATE_CHECK(var1 = mse_(src_, width_, ref_, width_)); + variance_ref(src_, ref_, log2width_, log2height_, stride_coeff, + stride_coeff, &sse2, false, VPX_BITS_8); EXPECT_EQ(var1, sse2); } } @@ -308,16 +502,59 @@ void MseTest::MaxTest_sse() { #endif #if CONFIG_VP9_ENCODER - unsigned int subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, int l2w, int l2h, int xoff, int yoff, - unsigned int *sse_ptr) { + unsigned int *sse_ptr, + bool use_high_bit_depth, + vpx_bit_depth_t bit_depth) { +#if CONFIG_VP9_HIGHBITDEPTH + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + if (!use_high_bit_depth) { + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; + se += diff; + sse += diff * diff; + } 
else { + uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred); + const int a1 = ref16[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref16[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref16[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x]; + se += diff; + sse += diff * diff; + } + } + } + if (bit_depth > 8) { + sse = ROUND_POWER_OF_TWO(sse, 2*(bit_depth-8)); + se = ROUND_POWER_OF_TWO(se, bit_depth-8); + } +#else int se = 0; unsigned int sse = 0; - const int w = 1 << l2w, h = 1 << l2h; + const int w = 1 << l2w; + const int h = 1 << l2h; for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { // bilinear interpolation at a 16th pel step @@ -328,11 +565,12 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref, const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); const int r = a + (((b - a) * yoff + 8) >> 4); - int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; + const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; se += diff; sse += diff * diff; } } +#endif // CONFIG_VP9_HIGHBITDEPTH *sse_ptr = sse; return sse - (((int64_t) se * se) >> (l2w + l2h)); } @@ -340,44 +578,84 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref, template class SubpelVarianceTest : public ::testing::TestWithParam > { + SubpelVarianceFunctionType, int> > { public: virtual void SetUp() { - const tuple& params = + const tuple& params = this->GetParam(); log2width_ = get<0>(params); width_ = 1 << log2width_; log2height_ = get<1>(params); height_ = 1 << log2height_; subpel_variance_ = get<2>(params); + if (get<3>(params)) { + bit_depth_ = (vpx_bit_depth_t) 
get<3>(params); + use_high_bit_depth_ = true; + } else { + bit_depth_ = VPX_BITS_8; + use_high_bit_depth_ = false; + } + mask_ = (1 << bit_depth_)-1; rnd_.Reset(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + src_ = reinterpret_cast(vpx_memalign(16, block_size_)); + sec_ = reinterpret_cast(vpx_memalign(16, block_size_)); + ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; + } else { + src_ = CONVERT_TO_BYTEPTR( + reinterpret_cast( + vpx_memalign(16, block_size_*sizeof(uint16_t)))); + sec_ = CONVERT_TO_BYTEPTR( + reinterpret_cast( + vpx_memalign(16, block_size_*sizeof(uint16_t)))); + ref_ = CONVERT_TO_BYTEPTR( + new uint16_t[block_size_ + width_ + height_ + 1]); + } +#else src_ = reinterpret_cast(vpx_memalign(16, block_size_)); sec_ = reinterpret_cast(vpx_memalign(16, block_size_)); ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; +#endif // CONFIG_VP9_HIGHBITDEPTH ASSERT_TRUE(src_ != NULL); ASSERT_TRUE(sec_ != NULL); ASSERT_TRUE(ref_ != NULL); } virtual void TearDown() { +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + vpx_free(src_); + delete[] ref_; + vpx_free(sec_); + } else { + vpx_free(CONVERT_TO_SHORTPTR(src_)); + delete[] CONVERT_TO_SHORTPTR(ref_); + vpx_free(CONVERT_TO_SHORTPTR(sec_)); + } +#else vpx_free(src_); delete[] ref_; vpx_free(sec_); +#endif libvpx_test::ClearSystemState(); } protected: void RefTest(); + void ExtremeRefTest(); ACMRandom rnd_; uint8_t *src_; uint8_t *ref_; uint8_t *sec_; + bool use_high_bit_depth_; + vpx_bit_depth_t bit_depth_; int width_, log2width_; int height_, log2height_; - int block_size_; + int block_size_, mask_; SubpelVarianceFunctionType subpel_variance_; }; @@ -385,18 +663,78 @@ template void SubpelVarianceTest::RefTest() { for (int x = 0; x < 16; ++x) { for (int y = 0; y < 16; ++y) { +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd_.Rand8(); + } + for 
(int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + ref_[j] = rnd_.Rand8(); + } + } else { + for (int j = 0; j < block_size_; j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; + } + for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + } + } +#else for (int j = 0; j < block_size_; j++) { src_[j] = rnd_.Rand8(); } for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { ref_[j] = rnd_.Rand8(); } +#endif // CONFIG_VP9_HIGHBITDEPTH unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_, - log2height_, x, y, &sse2); + log2height_, x, y, &sse2, + use_high_bit_depth_, + bit_depth_); + EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; + EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; + } + } +} + +template +void SubpelVarianceTest::ExtremeRefTest() { + // Compare against reference. + // Src: Set the first half of values to 0, the second half to the maximum. + // Ref: Set the first half of values to the maximum, the second half to 0. 
+ for (int x = 0; x < 16; ++x) { + for (int y = 0; y < 16; ++y) { + const int half = block_size_ / 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + memset(src_, 0, half); + memset(src_ + half, 255, half); + memset(ref_, 255, half); + memset(ref_ + half, 0, half + width_ + height_ + 1); + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 0, half); + vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, mask_, half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), mask_, half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, + half + width_ + height_ + 1); + } +#else + memset(src_, 0, half); + memset(src_ + half, 255, half); + memset(ref_, 255, half); + memset(ref_ + half, 0, half + width_ + height_ + 1); +#endif // CONFIG_VP9_HIGHBITDEPTH + unsigned int sse1, sse2; + unsigned int var1; + ASM_REGISTER_STATE_CHECK( + var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); + const unsigned int var2 = + subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, + use_high_bit_depth_, bit_depth_); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -407,6 +745,25 @@ template<> void SubpelVarianceTest::RefTest() { for (int x = 0; x < 16; ++x) { for (int y = 0; y < 16; ++y) { +#if CONFIG_VP9_HIGHBITDEPTH + if (!use_high_bit_depth_) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd_.Rand8(); + sec_[j] = rnd_.Rand8(); + } + for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + ref_[j] = rnd_.Rand8(); + } + } else { + for (int j = 0; j < block_size_; j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; + CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_; + } + for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + } + } +#else for (int j = 0; j < block_size_; j++) { src_[j] = rnd_.Rand8(); sec_[j] = rnd_.Rand8(); @@ -414,6 +771,7 @@ void SubpelVarianceTest::RefTest() { for 
(int j = 0; j < block_size_ + width_ + height_ + 1; j++) { ref_[j] = rnd_.Rand8(); } +#endif unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( @@ -421,7 +779,9 @@ void SubpelVarianceTest::RefTest() { src_, width_, &sse1, sec_)); const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_, - x, y, &sse2); + x, y, &sse2, + use_high_bit_depth_, + bit_depth_); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -468,11 +828,11 @@ const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c; const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c; INSTANTIATE_TEST_CASE_P( C, VP8VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_c), - make_tuple(3, 3, variance8x8_c), - make_tuple(3, 4, variance8x16_c), - make_tuple(4, 3, variance16x8_c), - make_tuple(4, 4, variance16x16_c))); + ::testing::Values(make_tuple(2, 2, variance4x4_c, 0), + make_tuple(3, 3, variance8x8_c, 0), + make_tuple(3, 4, variance8x16_c, 0), + make_tuple(4, 3, variance16x8_c, 0), + make_tuple(4, 4, variance16x16_c, 0))); #if HAVE_NEON const vp8_sse_fn_t get4x4sse_cs_neon = vp8_get4x4sse_cs_neon; @@ -491,13 +851,12 @@ const vp8_variance_fn_t variance16x8_neon = vp8_variance16x8_neon; const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon; INSTANTIATE_TEST_CASE_P( NEON, VP8VarianceTest, - ::testing::Values(make_tuple(3, 3, variance8x8_neon), - make_tuple(3, 4, variance8x16_neon), - make_tuple(4, 3, variance16x8_neon), - make_tuple(4, 4, variance16x16_neon))); + ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0), + make_tuple(3, 4, variance8x16_neon, 0), + make_tuple(4, 3, variance16x8_neon, 0), + make_tuple(4, 4, variance16x16_neon, 0))); #endif - #if HAVE_MMX const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx; const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx; @@ -506,11 +865,11 @@ const vp8_variance_fn_t variance16x8_mmx = 
vp8_variance16x8_mmx; const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx; INSTANTIATE_TEST_CASE_P( MMX, VP8VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_mmx), - make_tuple(3, 3, variance8x8_mmx), - make_tuple(3, 4, variance8x16_mmx), - make_tuple(4, 3, variance16x8_mmx), - make_tuple(4, 4, variance16x16_mmx))); + ::testing::Values(make_tuple(2, 2, variance4x4_mmx, 0), + make_tuple(3, 3, variance8x8_mmx, 0), + make_tuple(3, 4, variance8x16_mmx, 0), + make_tuple(4, 3, variance16x8_mmx, 0), + make_tuple(4, 4, variance16x16_mmx, 0))); #endif #if HAVE_SSE2 @@ -521,11 +880,11 @@ const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt; const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt; INSTANTIATE_TEST_CASE_P( SSE2, VP8VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_wmt), - make_tuple(3, 3, variance8x8_wmt), - make_tuple(3, 4, variance8x16_wmt), - make_tuple(4, 3, variance16x8_wmt), - make_tuple(4, 4, variance16x16_wmt))); + ::testing::Values(make_tuple(2, 2, variance4x4_wmt, 0), + make_tuple(3, 3, variance8x8_wmt, 0), + make_tuple(3, 4, variance8x16_wmt, 0), + make_tuple(4, 3, variance16x8_wmt, 0), + make_tuple(4, 4, variance16x16_wmt, 0))); #endif #endif // CONFIG_VP8_ENCODER @@ -537,7 +896,6 @@ INSTANTIATE_TEST_CASE_P( namespace vp9 { #if CONFIG_VP9_ENCODER - TEST_P(SumOfSquaresTest, Const) { ConstTest(); } TEST_P(SumOfSquaresTest, Ref) { RefTest(); } @@ -550,10 +908,27 @@ typedef SubpelVarianceTest VP9SubpelAvgVarianceTest; TEST_P(VP9VarianceTest, Zero) { ZeroTest(); } TEST_P(VP9VarianceTest, Ref) { RefTest(); } +TEST_P(VP9VarianceTest, RefStride) { RefStrideTest(); } TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); } +TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); } TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); } +#if CONFIG_VP9_HIGHBITDEPTH +typedef VarianceTest VP9VarianceHighTest; +typedef SubpelVarianceTest 
VP9SubpelVarianceHighTest; +typedef SubpelVarianceTest + VP9SubpelAvgVarianceHighTest; + +TEST_P(VP9VarianceHighTest, Zero) { ZeroTest(); } +TEST_P(VP9VarianceHighTest, Ref) { RefTest(); } +TEST_P(VP9VarianceHighTest, RefStride) { RefStrideTest(); } +TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); } +TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); } +TEST_P(VP9VarianceHighTest, OneQuarter) { OneQuarterTest(); } +#endif // CONFIG_VP9_HIGHBITDEPTH + const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c; const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c; const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c; @@ -569,20 +944,115 @@ const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c; const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c; INSTANTIATE_TEST_CASE_P( C, VP9VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_c), - make_tuple(2, 3, variance4x8_c), - make_tuple(3, 2, variance8x4_c), - make_tuple(3, 3, variance8x8_c), - make_tuple(3, 4, variance8x16_c), - make_tuple(4, 3, variance16x8_c), - make_tuple(4, 4, variance16x16_c), - make_tuple(4, 5, variance16x32_c), - make_tuple(5, 4, variance32x16_c), - make_tuple(5, 5, variance32x32_c), - make_tuple(5, 6, variance32x64_c), - make_tuple(6, 5, variance64x32_c), - make_tuple(6, 6, variance64x64_c))); - + ::testing::Values(make_tuple(2, 2, variance4x4_c, 0), + make_tuple(2, 3, variance4x8_c, 0), + make_tuple(3, 2, variance8x4_c, 0), + make_tuple(3, 3, variance8x8_c, 0), + make_tuple(3, 4, variance8x16_c, 0), + make_tuple(4, 3, variance16x8_c, 0), + make_tuple(4, 4, variance16x16_c, 0), + make_tuple(4, 5, variance16x32_c, 0), + make_tuple(5, 4, variance32x16_c, 0), + make_tuple(5, 5, variance32x32_c, 0), + make_tuple(5, 6, variance32x64_c, 0), + make_tuple(6, 5, variance64x32_c, 0), + make_tuple(6, 6, variance64x64_c, 0))); +#if CONFIG_VP9_HIGHBITDEPTH +const vp9_variance_fn_t 
highbd_10_variance4x4_c = vp9_highbd_10_variance4x4_c; +const vp9_variance_fn_t highbd_10_variance4x8_c = vp9_highbd_10_variance4x8_c; +const vp9_variance_fn_t highbd_10_variance8x4_c = vp9_highbd_10_variance8x4_c; +const vp9_variance_fn_t highbd_10_variance8x8_c = vp9_highbd_10_variance8x8_c; +const vp9_variance_fn_t highbd_10_variance8x16_c = vp9_highbd_10_variance8x16_c; +const vp9_variance_fn_t highbd_10_variance16x8_c = vp9_highbd_10_variance16x8_c; +const vp9_variance_fn_t highbd_10_variance16x16_c = + vp9_highbd_10_variance16x16_c; +const vp9_variance_fn_t highbd_10_variance16x32_c = + vp9_highbd_10_variance16x32_c; +const vp9_variance_fn_t highbd_10_variance32x16_c = + vp9_highbd_10_variance32x16_c; +const vp9_variance_fn_t highbd_10_variance32x32_c = + vp9_highbd_10_variance32x32_c; +const vp9_variance_fn_t highbd_10_variance32x64_c = + vp9_highbd_10_variance32x64_c; +const vp9_variance_fn_t highbd_10_variance64x32_c = + vp9_highbd_10_variance64x32_c; +const vp9_variance_fn_t highbd_10_variance64x64_c = + vp9_highbd_10_variance64x64_c; +const vp9_variance_fn_t highbd_12_variance4x4_c = vp9_highbd_12_variance4x4_c; +const vp9_variance_fn_t highbd_12_variance4x8_c = vp9_highbd_12_variance4x8_c; +const vp9_variance_fn_t highbd_12_variance8x4_c = vp9_highbd_12_variance8x4_c; +const vp9_variance_fn_t highbd_12_variance8x8_c = vp9_highbd_12_variance8x8_c; +const vp9_variance_fn_t highbd_12_variance8x16_c = vp9_highbd_12_variance8x16_c; +const vp9_variance_fn_t highbd_12_variance16x8_c = vp9_highbd_12_variance16x8_c; +const vp9_variance_fn_t highbd_12_variance16x16_c = + vp9_highbd_12_variance16x16_c; +const vp9_variance_fn_t highbd_12_variance16x32_c = + vp9_highbd_12_variance16x32_c; +const vp9_variance_fn_t highbd_12_variance32x16_c = + vp9_highbd_12_variance32x16_c; +const vp9_variance_fn_t highbd_12_variance32x32_c = + vp9_highbd_12_variance32x32_c; +const vp9_variance_fn_t highbd_12_variance32x64_c = + vp9_highbd_12_variance32x64_c; +const vp9_variance_fn_t 
highbd_12_variance64x32_c = + vp9_highbd_12_variance64x32_c; +const vp9_variance_fn_t highbd_12_variance64x64_c = + vp9_highbd_12_variance64x64_c; +const vp9_variance_fn_t highbd_variance4x4_c = vp9_highbd_variance4x4_c; +const vp9_variance_fn_t highbd_variance4x8_c = vp9_highbd_variance4x8_c; +const vp9_variance_fn_t highbd_variance8x4_c = vp9_highbd_variance8x4_c; +const vp9_variance_fn_t highbd_variance8x8_c = vp9_highbd_variance8x8_c; +const vp9_variance_fn_t highbd_variance8x16_c = vp9_highbd_variance8x16_c; +const vp9_variance_fn_t highbd_variance16x8_c = vp9_highbd_variance16x8_c; +const vp9_variance_fn_t highbd_variance16x16_c = vp9_highbd_variance16x16_c; +const vp9_variance_fn_t highbd_variance16x32_c = vp9_highbd_variance16x32_c; +const vp9_variance_fn_t highbd_variance32x16_c = vp9_highbd_variance32x16_c; +const vp9_variance_fn_t highbd_variance32x32_c = vp9_highbd_variance32x32_c; +const vp9_variance_fn_t highbd_variance32x64_c = vp9_highbd_variance32x64_c; +const vp9_variance_fn_t highbd_variance64x32_c = vp9_highbd_variance64x32_c; +const vp9_variance_fn_t highbd_variance64x64_c = vp9_highbd_variance64x64_c; +INSTANTIATE_TEST_CASE_P( + C, VP9VarianceHighTest, + ::testing::Values(make_tuple(2, 2, highbd_10_variance4x4_c, 10), + make_tuple(2, 3, highbd_10_variance4x8_c, 10), + make_tuple(3, 2, highbd_10_variance8x4_c, 10), + make_tuple(3, 3, highbd_10_variance8x8_c, 10), + make_tuple(3, 4, highbd_10_variance8x16_c, 10), + make_tuple(4, 3, highbd_10_variance16x8_c, 10), + make_tuple(4, 4, highbd_10_variance16x16_c, 10), + make_tuple(4, 5, highbd_10_variance16x32_c, 10), + make_tuple(5, 4, highbd_10_variance32x16_c, 10), + make_tuple(5, 5, highbd_10_variance32x32_c, 10), + make_tuple(5, 6, highbd_10_variance32x64_c, 10), + make_tuple(6, 5, highbd_10_variance64x32_c, 10), + make_tuple(6, 6, highbd_10_variance64x64_c, 10), + make_tuple(2, 2, highbd_12_variance4x4_c, 12), + make_tuple(2, 3, highbd_12_variance4x8_c, 12), + make_tuple(3, 2, 
highbd_12_variance8x4_c, 12), + make_tuple(3, 3, highbd_12_variance8x8_c, 12), + make_tuple(3, 4, highbd_12_variance8x16_c, 12), + make_tuple(4, 3, highbd_12_variance16x8_c, 12), + make_tuple(4, 4, highbd_12_variance16x16_c, 12), + make_tuple(4, 5, highbd_12_variance16x32_c, 12), + make_tuple(5, 4, highbd_12_variance32x16_c, 12), + make_tuple(5, 5, highbd_12_variance32x32_c, 12), + make_tuple(5, 6, highbd_12_variance32x64_c, 12), + make_tuple(6, 5, highbd_12_variance64x32_c, 12), + make_tuple(6, 6, highbd_12_variance64x64_c, 12), + make_tuple(2, 2, highbd_variance4x4_c, 8), + make_tuple(2, 3, highbd_variance4x8_c, 8), + make_tuple(3, 2, highbd_variance8x4_c, 8), + make_tuple(3, 3, highbd_variance8x8_c, 8), + make_tuple(3, 4, highbd_variance8x16_c, 8), + make_tuple(4, 3, highbd_variance16x8_c, 8), + make_tuple(4, 4, highbd_variance16x16_c, 8), + make_tuple(4, 5, highbd_variance16x32_c, 8), + make_tuple(5, 4, highbd_variance32x16_c, 8), + make_tuple(5, 5, highbd_variance32x32_c, 8), + make_tuple(5, 6, highbd_variance32x64_c, 8), + make_tuple(6, 5, highbd_variance64x32_c, 8), + make_tuple(6, 6, highbd_variance64x64_c, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH const vp9_subpixvariance_fn_t subpel_variance4x4_c = vp9_sub_pixel_variance4x4_c; const vp9_subpixvariance_fn_t subpel_variance4x8_c = @@ -611,20 +1081,19 @@ const vp9_subpixvariance_fn_t subpel_variance64x64_c = vp9_sub_pixel_variance64x64_c; INSTANTIATE_TEST_CASE_P( C, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c), - make_tuple(2, 3, subpel_variance4x8_c), - make_tuple(3, 2, subpel_variance8x4_c), - make_tuple(3, 3, subpel_variance8x8_c), - make_tuple(3, 4, subpel_variance8x16_c), - make_tuple(4, 3, subpel_variance16x8_c), - make_tuple(4, 4, subpel_variance16x16_c), - make_tuple(4, 5, subpel_variance16x32_c), - make_tuple(5, 4, subpel_variance32x16_c), - make_tuple(5, 5, subpel_variance32x32_c), - make_tuple(5, 6, subpel_variance32x64_c), - make_tuple(6, 5, 
subpel_variance64x32_c), - make_tuple(6, 6, subpel_variance64x64_c))); - + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c, 0), + make_tuple(2, 3, subpel_variance4x8_c, 0), + make_tuple(3, 2, subpel_variance8x4_c, 0), + make_tuple(3, 3, subpel_variance8x8_c, 0), + make_tuple(3, 4, subpel_variance8x16_c, 0), + make_tuple(4, 3, subpel_variance16x8_c, 0), + make_tuple(4, 4, subpel_variance16x16_c, 0), + make_tuple(4, 5, subpel_variance16x32_c, 0), + make_tuple(5, 4, subpel_variance32x16_c, 0), + make_tuple(5, 5, subpel_variance32x32_c, 0), + make_tuple(5, 6, subpel_variance32x64_c, 0), + make_tuple(6, 5, subpel_variance64x32_c, 0), + make_tuple(6, 6, subpel_variance64x64_c, 0))); const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c = vp9_sub_pixel_avg_variance4x4_c; const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c = @@ -653,23 +1122,263 @@ const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c = vp9_sub_pixel_avg_variance64x64_c; INSTANTIATE_TEST_CASE_P( C, VP9SubpelAvgVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c), - make_tuple(2, 3, subpel_avg_variance4x8_c), - make_tuple(3, 2, subpel_avg_variance8x4_c), - make_tuple(3, 3, subpel_avg_variance8x8_c), - make_tuple(3, 4, subpel_avg_variance8x16_c), - make_tuple(4, 3, subpel_avg_variance16x8_c), - make_tuple(4, 4, subpel_avg_variance16x16_c), - make_tuple(4, 5, subpel_avg_variance16x32_c), - make_tuple(5, 4, subpel_avg_variance32x16_c), - make_tuple(5, 5, subpel_avg_variance32x32_c), - make_tuple(5, 6, subpel_avg_variance32x64_c), - make_tuple(6, 5, subpel_avg_variance64x32_c), - make_tuple(6, 6, subpel_avg_variance64x64_c))); + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c, 0), + make_tuple(2, 3, subpel_avg_variance4x8_c, 0), + make_tuple(3, 2, subpel_avg_variance8x4_c, 0), + make_tuple(3, 3, subpel_avg_variance8x8_c, 0), + make_tuple(3, 4, subpel_avg_variance8x16_c, 0), + make_tuple(4, 3, subpel_avg_variance16x8_c, 0), + make_tuple(4, 4, 
subpel_avg_variance16x16_c, 0), + make_tuple(4, 5, subpel_avg_variance16x32_c, 0), + make_tuple(5, 4, subpel_avg_variance32x16_c, 0), + make_tuple(5, 5, subpel_avg_variance32x32_c, 0), + make_tuple(5, 6, subpel_avg_variance32x64_c, 0), + make_tuple(6, 5, subpel_avg_variance64x32_c, 0), + make_tuple(6, 6, subpel_avg_variance64x64_c, 0))); +#if CONFIG_VP9_HIGHBITDEPTH +const vp9_subpixvariance_fn_t highbd_10_subpel_variance4x4_c = + vp9_highbd_10_sub_pixel_variance4x4_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance4x8_c = + vp9_highbd_10_sub_pixel_variance4x8_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x4_c = + vp9_highbd_10_sub_pixel_variance8x4_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x8_c = + vp9_highbd_10_sub_pixel_variance8x8_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x16_c = + vp9_highbd_10_sub_pixel_variance8x16_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x8_c = + vp9_highbd_10_sub_pixel_variance16x8_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x16_c = + vp9_highbd_10_sub_pixel_variance16x16_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x32_c = + vp9_highbd_10_sub_pixel_variance16x32_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x16_c = + vp9_highbd_10_sub_pixel_variance32x16_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x32_c = + vp9_highbd_10_sub_pixel_variance32x32_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x64_c = + vp9_highbd_10_sub_pixel_variance32x64_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x32_c = + vp9_highbd_10_sub_pixel_variance64x32_c; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x64_c = + vp9_highbd_10_sub_pixel_variance64x64_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance4x4_c = + vp9_highbd_12_sub_pixel_variance4x4_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance4x8_c = + vp9_highbd_12_sub_pixel_variance4x8_c; +const 
vp9_subpixvariance_fn_t highbd_12_subpel_variance8x4_c = + vp9_highbd_12_sub_pixel_variance8x4_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x8_c = + vp9_highbd_12_sub_pixel_variance8x8_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x16_c = + vp9_highbd_12_sub_pixel_variance8x16_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x8_c = + vp9_highbd_12_sub_pixel_variance16x8_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x16_c = + vp9_highbd_12_sub_pixel_variance16x16_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x32_c = + vp9_highbd_12_sub_pixel_variance16x32_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x16_c = + vp9_highbd_12_sub_pixel_variance32x16_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x32_c = + vp9_highbd_12_sub_pixel_variance32x32_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x64_c = + vp9_highbd_12_sub_pixel_variance32x64_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x32_c = + vp9_highbd_12_sub_pixel_variance64x32_c; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x64_c = + vp9_highbd_12_sub_pixel_variance64x64_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance4x4_c = + vp9_highbd_sub_pixel_variance4x4_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance4x8_c = + vp9_highbd_sub_pixel_variance4x8_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_c = + vp9_highbd_sub_pixel_variance8x4_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_c = + vp9_highbd_sub_pixel_variance8x8_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance8x16_c = + vp9_highbd_sub_pixel_variance8x16_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance16x8_c = + vp9_highbd_sub_pixel_variance16x8_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance16x16_c = + vp9_highbd_sub_pixel_variance16x16_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance16x32_c = + vp9_highbd_sub_pixel_variance16x32_c; +const 
vp9_subpixvariance_fn_t highbd_subpel_variance32x16_c = + vp9_highbd_sub_pixel_variance32x16_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance32x32_c = + vp9_highbd_sub_pixel_variance32x32_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance32x64_c = + vp9_highbd_sub_pixel_variance32x64_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance64x32_c = + vp9_highbd_sub_pixel_variance64x32_c; +const vp9_subpixvariance_fn_t highbd_subpel_variance64x64_c = + vp9_highbd_sub_pixel_variance64x64_c; +INSTANTIATE_TEST_CASE_P( + C, VP9SubpelVarianceHighTest, + ::testing::Values(make_tuple(2, 2, highbd_10_subpel_variance4x4_c, 10), + make_tuple(2, 3, highbd_10_subpel_variance4x8_c, 10), + make_tuple(3, 2, highbd_10_subpel_variance8x4_c, 10), + make_tuple(3, 3, highbd_10_subpel_variance8x8_c, 10), + make_tuple(3, 4, highbd_10_subpel_variance8x16_c, 10), + make_tuple(4, 3, highbd_10_subpel_variance16x8_c, 10), + make_tuple(4, 4, highbd_10_subpel_variance16x16_c, 10), + make_tuple(4, 5, highbd_10_subpel_variance16x32_c, 10), + make_tuple(5, 4, highbd_10_subpel_variance32x16_c, 10), + make_tuple(5, 5, highbd_10_subpel_variance32x32_c, 10), + make_tuple(5, 6, highbd_10_subpel_variance32x64_c, 10), + make_tuple(6, 5, highbd_10_subpel_variance64x32_c, 10), + make_tuple(6, 6, highbd_10_subpel_variance64x64_c, 10), + make_tuple(2, 2, highbd_12_subpel_variance4x4_c, 12), + make_tuple(2, 3, highbd_12_subpel_variance4x8_c, 12), + make_tuple(3, 2, highbd_12_subpel_variance8x4_c, 12), + make_tuple(3, 3, highbd_12_subpel_variance8x8_c, 12), + make_tuple(3, 4, highbd_12_subpel_variance8x16_c, 12), + make_tuple(4, 3, highbd_12_subpel_variance16x8_c, 12), + make_tuple(4, 4, highbd_12_subpel_variance16x16_c, 12), + make_tuple(4, 5, highbd_12_subpel_variance16x32_c, 12), + make_tuple(5, 4, highbd_12_subpel_variance32x16_c, 12), + make_tuple(5, 5, highbd_12_subpel_variance32x32_c, 12), + make_tuple(5, 6, highbd_12_subpel_variance32x64_c, 12), + make_tuple(6, 5, 
highbd_12_subpel_variance64x32_c, 12), + make_tuple(6, 6, highbd_12_subpel_variance64x64_c, 12), + make_tuple(2, 2, highbd_subpel_variance4x4_c, 8), + make_tuple(2, 3, highbd_subpel_variance4x8_c, 8), + make_tuple(3, 2, highbd_subpel_variance8x4_c, 8), + make_tuple(3, 3, highbd_subpel_variance8x8_c, 8), + make_tuple(3, 4, highbd_subpel_variance8x16_c, 8), + make_tuple(4, 3, highbd_subpel_variance16x8_c, 8), + make_tuple(4, 4, highbd_subpel_variance16x16_c, 8), + make_tuple(4, 5, highbd_subpel_variance16x32_c, 8), + make_tuple(5, 4, highbd_subpel_variance32x16_c, 8), + make_tuple(5, 5, highbd_subpel_variance32x32_c, 8), + make_tuple(5, 6, highbd_subpel_variance32x64_c, 8), + make_tuple(6, 5, highbd_subpel_variance64x32_c, 8), + make_tuple(6, 6, highbd_subpel_variance64x64_c, 8))); +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x4_c = + vp9_highbd_10_sub_pixel_avg_variance4x4_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x8_c = + vp9_highbd_10_sub_pixel_avg_variance4x8_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_c = + vp9_highbd_10_sub_pixel_avg_variance8x4_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_c = + vp9_highbd_10_sub_pixel_avg_variance8x8_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_c = + vp9_highbd_10_sub_pixel_avg_variance8x16_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_c = + vp9_highbd_10_sub_pixel_avg_variance16x8_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_c = + vp9_highbd_10_sub_pixel_avg_variance16x16_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_c = + vp9_highbd_10_sub_pixel_avg_variance16x32_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_c = + vp9_highbd_10_sub_pixel_avg_variance32x16_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_c = + vp9_highbd_10_sub_pixel_avg_variance32x32_c; +const 
vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_c = + vp9_highbd_10_sub_pixel_avg_variance32x64_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_c = + vp9_highbd_10_sub_pixel_avg_variance64x32_c; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_c = + vp9_highbd_10_sub_pixel_avg_variance64x64_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x4_c = + vp9_highbd_12_sub_pixel_avg_variance4x4_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x8_c = + vp9_highbd_12_sub_pixel_avg_variance4x8_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_c = + vp9_highbd_12_sub_pixel_avg_variance8x4_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_c = + vp9_highbd_12_sub_pixel_avg_variance8x8_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_c = + vp9_highbd_12_sub_pixel_avg_variance8x16_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_c = + vp9_highbd_12_sub_pixel_avg_variance16x8_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_c = + vp9_highbd_12_sub_pixel_avg_variance16x16_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_c = + vp9_highbd_12_sub_pixel_avg_variance16x32_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_c = + vp9_highbd_12_sub_pixel_avg_variance32x16_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_c = + vp9_highbd_12_sub_pixel_avg_variance32x32_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_c = + vp9_highbd_12_sub_pixel_avg_variance32x64_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_c = + vp9_highbd_12_sub_pixel_avg_variance64x32_c; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_c = + vp9_highbd_12_sub_pixel_avg_variance64x64_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x4_c = + 
vp9_highbd_sub_pixel_avg_variance4x4_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x8_c = + vp9_highbd_sub_pixel_avg_variance4x8_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_c = + vp9_highbd_sub_pixel_avg_variance8x4_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_c = + vp9_highbd_sub_pixel_avg_variance8x8_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_c = + vp9_highbd_sub_pixel_avg_variance8x16_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_c = + vp9_highbd_sub_pixel_avg_variance16x8_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_c = + vp9_highbd_sub_pixel_avg_variance16x16_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_c = + vp9_highbd_sub_pixel_avg_variance16x32_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_c = + vp9_highbd_sub_pixel_avg_variance32x16_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_c = + vp9_highbd_sub_pixel_avg_variance32x32_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_c = + vp9_highbd_sub_pixel_avg_variance32x64_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_c = + vp9_highbd_sub_pixel_avg_variance64x32_c; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_c = + vp9_highbd_sub_pixel_avg_variance64x64_c; +INSTANTIATE_TEST_CASE_P( + C, VP9SubpelAvgVarianceHighTest, + ::testing::Values( + make_tuple(2, 2, highbd_10_subpel_avg_variance4x4_c, 10), + make_tuple(2, 3, highbd_10_subpel_avg_variance4x8_c, 10), + make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_c, 10), + make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_c, 10), + make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_c, 10), + make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_c, 10), + make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_c, 10), + make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_c, 10), + make_tuple(5, 4, 
highbd_10_subpel_avg_variance32x16_c, 10), + make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_c, 10), + make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_c, 10), + make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_c, 10), + make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_c, 10), + make_tuple(2, 2, highbd_12_subpel_avg_variance4x4_c, 12), + make_tuple(2, 3, highbd_12_subpel_avg_variance4x8_c, 12), + make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_c, 12), + make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_c, 12), + make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_c, 12), + make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_c, 12), + make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_c, 12), + make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_c, 12), + make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_c, 12), + make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_c, 12), + make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_c, 12), + make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_c, 12), + make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_c, 12), + make_tuple(2, 2, highbd_subpel_avg_variance4x4_c, 8), + make_tuple(2, 3, highbd_subpel_avg_variance4x8_c, 8), + make_tuple(3, 2, highbd_subpel_avg_variance8x4_c, 8), + make_tuple(3, 3, highbd_subpel_avg_variance8x8_c, 8), + make_tuple(3, 4, highbd_subpel_avg_variance8x16_c, 8), + make_tuple(4, 3, highbd_subpel_avg_variance16x8_c, 8), + make_tuple(4, 4, highbd_subpel_avg_variance16x16_c, 8), + make_tuple(4, 5, highbd_subpel_avg_variance16x32_c, 8), + make_tuple(5, 4, highbd_subpel_avg_variance32x16_c, 8), + make_tuple(5, 5, highbd_subpel_avg_variance32x32_c, 8), + make_tuple(5, 6, highbd_subpel_avg_variance32x64_c, 8), + make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8), + make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSE2 #if CONFIG_USE_X86INC - INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest, ::testing::Values(vp9_get_mb_ss_sse2)); @@ 
-688,20 +1397,19 @@ const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2; const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2; INSTANTIATE_TEST_CASE_P( SSE2, VP9VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_sse2), - make_tuple(2, 3, variance4x8_sse2), - make_tuple(3, 2, variance8x4_sse2), - make_tuple(3, 3, variance8x8_sse2), - make_tuple(3, 4, variance8x16_sse2), - make_tuple(4, 3, variance16x8_sse2), - make_tuple(4, 4, variance16x16_sse2), - make_tuple(4, 5, variance16x32_sse2), - make_tuple(5, 4, variance32x16_sse2), - make_tuple(5, 5, variance32x32_sse2), - make_tuple(5, 6, variance32x64_sse2), - make_tuple(6, 5, variance64x32_sse2), - make_tuple(6, 6, variance64x64_sse2))); - + ::testing::Values(make_tuple(2, 2, variance4x4_sse2, 0), + make_tuple(2, 3, variance4x8_sse2, 0), + make_tuple(3, 2, variance8x4_sse2, 0), + make_tuple(3, 3, variance8x8_sse2, 0), + make_tuple(3, 4, variance8x16_sse2, 0), + make_tuple(4, 3, variance16x8_sse2, 0), + make_tuple(4, 4, variance16x16_sse2, 0), + make_tuple(4, 5, variance16x32_sse2, 0), + make_tuple(5, 4, variance32x16_sse2, 0), + make_tuple(5, 5, variance32x32_sse2, 0), + make_tuple(5, 6, variance32x64_sse2, 0), + make_tuple(6, 5, variance64x32_sse2, 0), + make_tuple(6, 6, variance64x64_sse2, 0))); const vp9_subpixvariance_fn_t subpel_variance4x4_sse = vp9_sub_pixel_variance4x4_sse; const vp9_subpixvariance_fn_t subpel_variance4x8_sse = @@ -730,20 +1438,19 @@ const vp9_subpixvariance_fn_t subpel_variance64x64_sse2 = vp9_sub_pixel_variance64x64_sse2; INSTANTIATE_TEST_CASE_P( SSE2, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse), - make_tuple(2, 3, subpel_variance4x8_sse), - make_tuple(3, 2, subpel_variance8x4_sse2), - make_tuple(3, 3, subpel_variance8x8_sse2), - make_tuple(3, 4, subpel_variance8x16_sse2), - make_tuple(4, 3, subpel_variance16x8_sse2), - make_tuple(4, 4, subpel_variance16x16_sse2), - make_tuple(4, 5, subpel_variance16x32_sse2), 
- make_tuple(5, 4, subpel_variance32x16_sse2), - make_tuple(5, 5, subpel_variance32x32_sse2), - make_tuple(5, 6, subpel_variance32x64_sse2), - make_tuple(6, 5, subpel_variance64x32_sse2), - make_tuple(6, 6, subpel_variance64x64_sse2))); - + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse, 0), + make_tuple(2, 3, subpel_variance4x8_sse, 0), + make_tuple(3, 2, subpel_variance8x4_sse2, 0), + make_tuple(3, 3, subpel_variance8x8_sse2, 0), + make_tuple(3, 4, subpel_variance8x16_sse2, 0), + make_tuple(4, 3, subpel_variance16x8_sse2, 0), + make_tuple(4, 4, subpel_variance16x16_sse2, 0), + make_tuple(4, 5, subpel_variance16x32_sse2, 0), + make_tuple(5, 4, subpel_variance32x16_sse2, 0), + make_tuple(5, 5, subpel_variance32x32_sse2, 0), + make_tuple(5, 6, subpel_variance32x64_sse2, 0), + make_tuple(6, 5, subpel_variance64x32_sse2, 0), + make_tuple(6, 6, subpel_variance64x64_sse2, 0))); const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse = vp9_sub_pixel_avg_variance4x4_sse; const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse = @@ -772,22 +1479,316 @@ const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 = vp9_sub_pixel_avg_variance64x64_sse2; INSTANTIATE_TEST_CASE_P( SSE2, VP9SubpelAvgVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse), - make_tuple(2, 3, subpel_avg_variance4x8_sse), - make_tuple(3, 2, subpel_avg_variance8x4_sse2), - make_tuple(3, 3, subpel_avg_variance8x8_sse2), - make_tuple(3, 4, subpel_avg_variance8x16_sse2), - make_tuple(4, 3, subpel_avg_variance16x8_sse2), - make_tuple(4, 4, subpel_avg_variance16x16_sse2), - make_tuple(4, 5, subpel_avg_variance16x32_sse2), - make_tuple(5, 4, subpel_avg_variance32x16_sse2), - make_tuple(5, 5, subpel_avg_variance32x32_sse2), - make_tuple(5, 6, subpel_avg_variance32x64_sse2), - make_tuple(6, 5, subpel_avg_variance64x32_sse2), - make_tuple(6, 6, subpel_avg_variance64x64_sse2))); -#endif -#endif - + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse, 0), + 
make_tuple(2, 3, subpel_avg_variance4x8_sse, 0), + make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0), + make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0), + make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0), + make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0), + make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0), + make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0), + make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0), + make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0), + make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0), + make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0), + make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0))); +#if CONFIG_VP9_HIGHBITDEPTH +const vp9_variance_fn_t highbd_variance8x8_sse2 = vp9_highbd_variance8x8_sse2; +const vp9_variance_fn_t highbd_10_variance8x8_sse2 = + vp9_highbd_10_variance8x8_sse2; +const vp9_variance_fn_t highbd_12_variance8x8_sse2 = + vp9_highbd_12_variance8x8_sse2; +const vp9_variance_fn_t highbd_variance8x16_sse2 = vp9_highbd_variance8x16_sse2; +const vp9_variance_fn_t highbd_10_variance8x16_sse2 = + vp9_highbd_10_variance8x16_sse2; +const vp9_variance_fn_t highbd_12_variance8x16_sse2 = + vp9_highbd_12_variance8x16_sse2; +const vp9_variance_fn_t highbd_variance16x8_sse2 = + vp9_highbd_variance16x8_sse2; +const vp9_variance_fn_t highbd_10_variance16x8_sse2 = + vp9_highbd_10_variance16x8_sse2; +const vp9_variance_fn_t highbd_12_variance16x8_sse2 = + vp9_highbd_12_variance16x8_sse2; +const vp9_variance_fn_t highbd_variance16x16_sse2 = + vp9_highbd_variance16x16_sse2; +const vp9_variance_fn_t highbd_10_variance16x16_sse2 = + vp9_highbd_10_variance16x16_sse2; +const vp9_variance_fn_t highbd_12_variance16x16_sse2 = + vp9_highbd_12_variance16x16_sse2; +const vp9_variance_fn_t highbd_variance16x32_sse2 = + vp9_highbd_variance16x32_sse2; +const vp9_variance_fn_t highbd_10_variance16x32_sse2 = + vp9_highbd_10_variance16x32_sse2; +const vp9_variance_fn_t highbd_12_variance16x32_sse2 = + vp9_highbd_12_variance16x32_sse2; +const 
vp9_variance_fn_t highbd_variance32x16_sse2 = + vp9_highbd_variance32x16_sse2; +const vp9_variance_fn_t highbd_10_variance32x16_sse2 = + vp9_highbd_10_variance32x16_sse2; +const vp9_variance_fn_t highbd_12_variance32x16_sse2 = + vp9_highbd_12_variance32x16_sse2; +const vp9_variance_fn_t highbd_variance32x32_sse2 = + vp9_highbd_variance32x32_sse2; +const vp9_variance_fn_t highbd_10_variance32x32_sse2 = + vp9_highbd_10_variance32x32_sse2; +const vp9_variance_fn_t highbd_12_variance32x32_sse2 = + vp9_highbd_12_variance32x32_sse2; +const vp9_variance_fn_t highbd_variance32x64_sse2 = + vp9_highbd_variance32x64_sse2; +const vp9_variance_fn_t highbd_10_variance32x64_sse2 = + vp9_highbd_10_variance32x64_sse2; +const vp9_variance_fn_t highbd_12_variance32x64_sse2 = + vp9_highbd_12_variance32x64_sse2; +const vp9_variance_fn_t highbd_variance64x32_sse2 = + vp9_highbd_variance64x32_sse2; +const vp9_variance_fn_t highbd_10_variance64x32_sse2 = + vp9_highbd_10_variance64x32_sse2; +const vp9_variance_fn_t highbd_12_variance64x32_sse2 = + vp9_highbd_12_variance64x32_sse2; +const vp9_variance_fn_t highbd_variance64x64_sse2 = + vp9_highbd_variance64x64_sse2; +const vp9_variance_fn_t highbd_10_variance64x64_sse2 = + vp9_highbd_10_variance64x64_sse2; +const vp9_variance_fn_t highbd_12_variance64x64_sse2 = + vp9_highbd_12_variance64x64_sse2; +INSTANTIATE_TEST_CASE_P( + SSE2, VP9VarianceHighTest, + ::testing::Values(make_tuple(3, 3, highbd_10_variance8x8_sse2, 10), + make_tuple(3, 4, highbd_10_variance8x16_sse2, 10), + make_tuple(4, 3, highbd_10_variance16x8_sse2, 10), + make_tuple(4, 4, highbd_10_variance16x16_sse2, 10), + make_tuple(4, 5, highbd_10_variance16x32_sse2, 10), + make_tuple(5, 4, highbd_10_variance32x16_sse2, 10), + make_tuple(5, 5, highbd_10_variance32x32_sse2, 10), + make_tuple(5, 6, highbd_10_variance32x64_sse2, 10), + make_tuple(6, 5, highbd_10_variance64x32_sse2, 10), + make_tuple(6, 6, highbd_10_variance64x64_sse2, 10), + make_tuple(3, 3, highbd_12_variance8x8_sse2, 
12), + make_tuple(3, 4, highbd_12_variance8x16_sse2, 12), + make_tuple(4, 3, highbd_12_variance16x8_sse2, 12), + make_tuple(4, 4, highbd_12_variance16x16_sse2, 12), + make_tuple(4, 5, highbd_12_variance16x32_sse2, 12), + make_tuple(5, 4, highbd_12_variance32x16_sse2, 12), + make_tuple(5, 5, highbd_12_variance32x32_sse2, 12), + make_tuple(5, 6, highbd_12_variance32x64_sse2, 12), + make_tuple(6, 5, highbd_12_variance64x32_sse2, 12), + make_tuple(6, 6, highbd_12_variance64x64_sse2, 12), + make_tuple(3, 3, highbd_variance8x8_sse2, 8), + make_tuple(3, 4, highbd_variance8x16_sse2, 8), + make_tuple(4, 3, highbd_variance16x8_sse2, 8), + make_tuple(4, 4, highbd_variance16x16_sse2, 8), + make_tuple(4, 5, highbd_variance16x32_sse2, 8), + make_tuple(5, 4, highbd_variance32x16_sse2, 8), + make_tuple(5, 5, highbd_variance32x32_sse2, 8), + make_tuple(5, 6, highbd_variance32x64_sse2, 8), + make_tuple(6, 5, highbd_variance64x32_sse2, 8), + make_tuple(6, 6, highbd_variance64x64_sse2, 8))); +const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_sse2 = + vp9_highbd_sub_pixel_variance8x4_sse2; +const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_sse2 = + vp9_highbd_sub_pixel_variance8x8_sse2; +const vp9_subpixvariance_fn_t highbd_subpel_variance8x16_sse2 = + vp9_highbd_sub_pixel_variance8x16_sse2; +const vp9_subpixvariance_fn_t highbd_subpel_variance16x8_sse2 = + vp9_highbd_sub_pixel_variance16x8_sse2; +const vp9_subpixvariance_fn_t highbd_subpel_variance16x16_sse2 = + vp9_highbd_sub_pixel_variance16x16_sse2; +const vp9_subpixvariance_fn_t highbd_subpel_variance16x32_sse2 = + vp9_highbd_sub_pixel_variance16x32_sse2; +const vp9_subpixvariance_fn_t highbd_subpel_variance32x16_sse2 = + vp9_highbd_sub_pixel_variance32x16_sse2; +const vp9_subpixvariance_fn_t highbd_subpel_variance32x32_sse2 = + vp9_highbd_sub_pixel_variance32x32_sse2; +const vp9_subpixvariance_fn_t highbd_subpel_variance32x64_sse2 = + vp9_highbd_sub_pixel_variance32x64_sse2; +const vp9_subpixvariance_fn_t 
highbd_subpel_variance64x32_sse2 = + vp9_highbd_sub_pixel_variance64x32_sse2; +const vp9_subpixvariance_fn_t highbd_subpel_variance64x64_sse2 = + vp9_highbd_sub_pixel_variance64x64_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x4_sse2 = + vp9_highbd_10_sub_pixel_variance8x4_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x8_sse2 = + vp9_highbd_10_sub_pixel_variance8x8_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance8x16_sse2 = + vp9_highbd_10_sub_pixel_variance8x16_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x8_sse2 = + vp9_highbd_10_sub_pixel_variance16x8_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x16_sse2 = + vp9_highbd_10_sub_pixel_variance16x16_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance16x32_sse2 = + vp9_highbd_10_sub_pixel_variance16x32_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x16_sse2 = + vp9_highbd_10_sub_pixel_variance32x16_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x32_sse2 = + vp9_highbd_10_sub_pixel_variance32x32_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance32x64_sse2 = + vp9_highbd_10_sub_pixel_variance32x64_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x32_sse2 = + vp9_highbd_10_sub_pixel_variance64x32_sse2; +const vp9_subpixvariance_fn_t highbd_10_subpel_variance64x64_sse2 = + vp9_highbd_10_sub_pixel_variance64x64_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x4_sse2 = + vp9_highbd_12_sub_pixel_variance8x4_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x8_sse2 = + vp9_highbd_12_sub_pixel_variance8x8_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance8x16_sse2 = + vp9_highbd_12_sub_pixel_variance8x16_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x8_sse2 = + vp9_highbd_12_sub_pixel_variance16x8_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x16_sse2 = + 
vp9_highbd_12_sub_pixel_variance16x16_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance16x32_sse2 = + vp9_highbd_12_sub_pixel_variance16x32_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x16_sse2 = + vp9_highbd_12_sub_pixel_variance32x16_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x32_sse2 = + vp9_highbd_12_sub_pixel_variance32x32_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance32x64_sse2 = + vp9_highbd_12_sub_pixel_variance32x64_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x32_sse2 = + vp9_highbd_12_sub_pixel_variance64x32_sse2; +const vp9_subpixvariance_fn_t highbd_12_subpel_variance64x64_sse2 = + vp9_highbd_12_sub_pixel_variance64x64_sse2; +INSTANTIATE_TEST_CASE_P( + SSE2, VP9SubpelVarianceHighTest, + ::testing::Values(make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10), + make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10), + make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10), + make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10), + make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10), + make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10), + make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10), + make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10), + make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10), + make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10), + make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10), + make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12), + make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12), + make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12), + make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12), + make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12), + make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12), + make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12), + make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12), + make_tuple(5, 6, 
highbd_12_subpel_variance32x64_sse2, 12), + make_tuple(6, 5, highbd_12_subpel_variance64x32_sse2, 12), + make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12), + make_tuple(3, 2, highbd_subpel_variance8x4_sse2, 8), + make_tuple(3, 3, highbd_subpel_variance8x8_sse2, 8), + make_tuple(3, 4, highbd_subpel_variance8x16_sse2, 8), + make_tuple(4, 3, highbd_subpel_variance16x8_sse2, 8), + make_tuple(4, 4, highbd_subpel_variance16x16_sse2, 8), + make_tuple(4, 5, highbd_subpel_variance16x32_sse2, 8), + make_tuple(5, 4, highbd_subpel_variance32x16_sse2, 8), + make_tuple(5, 5, highbd_subpel_variance32x32_sse2, 8), + make_tuple(5, 6, highbd_subpel_variance32x64_sse2, 8), + make_tuple(6, 5, highbd_subpel_variance64x32_sse2, 8), + make_tuple(6, 6, highbd_subpel_variance64x64_sse2, 8))); +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_sse2 = + vp9_highbd_sub_pixel_avg_variance8x4_sse2; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_sse2 = + vp9_highbd_sub_pixel_avg_variance8x8_sse2; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_sse2 = + vp9_highbd_sub_pixel_avg_variance8x16_sse2; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_sse2 = + vp9_highbd_sub_pixel_avg_variance16x8_sse2; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_sse2 = + vp9_highbd_sub_pixel_avg_variance16x16_sse2; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_sse2 = + vp9_highbd_sub_pixel_avg_variance16x32_sse2; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_sse2 = + vp9_highbd_sub_pixel_avg_variance32x16_sse2; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_sse2 = + vp9_highbd_sub_pixel_avg_variance32x32_sse2; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_sse2 = + vp9_highbd_sub_pixel_avg_variance32x64_sse2; +const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_sse2 = + vp9_highbd_sub_pixel_avg_variance64x32_sse2; +const 
vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_sse2 = + vp9_highbd_sub_pixel_avg_variance64x64_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_sse2 = + vp9_highbd_10_sub_pixel_avg_variance8x4_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_sse2 = + vp9_highbd_10_sub_pixel_avg_variance8x8_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_sse2 = + vp9_highbd_10_sub_pixel_avg_variance8x16_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_sse2 = + vp9_highbd_10_sub_pixel_avg_variance16x8_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_sse2 = + vp9_highbd_10_sub_pixel_avg_variance16x16_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_sse2 = + vp9_highbd_10_sub_pixel_avg_variance16x32_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_sse2 = + vp9_highbd_10_sub_pixel_avg_variance32x16_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_sse2 = + vp9_highbd_10_sub_pixel_avg_variance32x32_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_sse2 = + vp9_highbd_10_sub_pixel_avg_variance32x64_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_sse2 = + vp9_highbd_10_sub_pixel_avg_variance64x32_sse2; +const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_sse2 = + vp9_highbd_10_sub_pixel_avg_variance64x64_sse2; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_sse2 = + vp9_highbd_12_sub_pixel_avg_variance8x4_sse2; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_sse2 = + vp9_highbd_12_sub_pixel_avg_variance8x8_sse2; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_sse2 = + vp9_highbd_12_sub_pixel_avg_variance8x16_sse2; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_sse2 = + vp9_highbd_12_sub_pixel_avg_variance16x8_sse2; +const 
vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_sse2 = + vp9_highbd_12_sub_pixel_avg_variance16x16_sse2; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_sse2 = + vp9_highbd_12_sub_pixel_avg_variance16x32_sse2; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_sse2 = + vp9_highbd_12_sub_pixel_avg_variance32x16_sse2; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_sse2 = + vp9_highbd_12_sub_pixel_avg_variance32x32_sse2; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_sse2 = + vp9_highbd_12_sub_pixel_avg_variance32x64_sse2; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_sse2 = + vp9_highbd_12_sub_pixel_avg_variance64x32_sse2; +const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_sse2 = + vp9_highbd_12_sub_pixel_avg_variance64x64_sse2; +INSTANTIATE_TEST_CASE_P( + SSE2, VP9SubpelAvgVarianceHighTest, + ::testing::Values( + make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10), + make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10), + make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10), + make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10), + make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10), + make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10), + make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10), + make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10), + make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10), + make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10), + make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10), + make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12), + make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12), + make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12), + make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12), + make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12), + 
make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12), + make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12), + make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12), + make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12), + make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12), + make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12), + make_tuple(3, 2, highbd_subpel_avg_variance8x4_sse2, 8), + make_tuple(3, 3, highbd_subpel_avg_variance8x8_sse2, 8), + make_tuple(3, 4, highbd_subpel_avg_variance8x16_sse2, 8), + make_tuple(4, 3, highbd_subpel_avg_variance16x8_sse2, 8), + make_tuple(4, 4, highbd_subpel_avg_variance16x16_sse2, 8), + make_tuple(4, 5, highbd_subpel_avg_variance16x32_sse2, 8), + make_tuple(5, 4, highbd_subpel_avg_variance32x16_sse2, 8), + make_tuple(5, 5, highbd_subpel_avg_variance32x32_sse2, 8), + make_tuple(5, 6, highbd_subpel_avg_variance32x64_sse2, 8), + make_tuple(6, 5, highbd_subpel_avg_variance64x32_sse2, 8), + make_tuple(6, 6, highbd_subpel_avg_variance64x64_sse2, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_USE_X86INC +#endif // HAVE_SSE2 #if HAVE_SSSE3 #if CONFIG_USE_X86INC @@ -819,20 +1820,19 @@ const vp9_subpixvariance_fn_t subpel_variance64x64_ssse3 = vp9_sub_pixel_variance64x64_ssse3; INSTANTIATE_TEST_CASE_P( SSSE3, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3), - make_tuple(2, 3, subpel_variance4x8_ssse3), - make_tuple(3, 2, subpel_variance8x4_ssse3), - make_tuple(3, 3, subpel_variance8x8_ssse3), - make_tuple(3, 4, subpel_variance8x16_ssse3), - make_tuple(4, 3, subpel_variance16x8_ssse3), - make_tuple(4, 4, subpel_variance16x16_ssse3), - make_tuple(4, 5, subpel_variance16x32_ssse3), - make_tuple(5, 4, subpel_variance32x16_ssse3), - make_tuple(5, 5, subpel_variance32x32_ssse3), - make_tuple(5, 6, subpel_variance32x64_ssse3), - make_tuple(6, 5, subpel_variance64x32_ssse3), - make_tuple(6, 6, subpel_variance64x64_ssse3))); - + 
::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3, 0), + make_tuple(2, 3, subpel_variance4x8_ssse3, 0), + make_tuple(3, 2, subpel_variance8x4_ssse3, 0), + make_tuple(3, 3, subpel_variance8x8_ssse3, 0), + make_tuple(3, 4, subpel_variance8x16_ssse3, 0), + make_tuple(4, 3, subpel_variance16x8_ssse3, 0), + make_tuple(4, 4, subpel_variance16x16_ssse3, 0), + make_tuple(4, 5, subpel_variance16x32_ssse3, 0), + make_tuple(5, 4, subpel_variance32x16_ssse3, 0), + make_tuple(5, 5, subpel_variance32x32_ssse3, 0), + make_tuple(5, 6, subpel_variance32x64_ssse3, 0), + make_tuple(6, 5, subpel_variance64x32_ssse3, 0), + make_tuple(6, 6, subpel_variance64x64_ssse3, 0))); const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 = vp9_sub_pixel_avg_variance4x4_ssse3; const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 = @@ -861,21 +1861,21 @@ const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 = vp9_sub_pixel_avg_variance64x64_ssse3; INSTANTIATE_TEST_CASE_P( SSSE3, VP9SubpelAvgVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3), - make_tuple(2, 3, subpel_avg_variance4x8_ssse3), - make_tuple(3, 2, subpel_avg_variance8x4_ssse3), - make_tuple(3, 3, subpel_avg_variance8x8_ssse3), - make_tuple(3, 4, subpel_avg_variance8x16_ssse3), - make_tuple(4, 3, subpel_avg_variance16x8_ssse3), - make_tuple(4, 4, subpel_avg_variance16x16_ssse3), - make_tuple(4, 5, subpel_avg_variance16x32_ssse3), - make_tuple(5, 4, subpel_avg_variance32x16_ssse3), - make_tuple(5, 5, subpel_avg_variance32x32_ssse3), - make_tuple(5, 6, subpel_avg_variance32x64_ssse3), - make_tuple(6, 5, subpel_avg_variance64x32_ssse3), - make_tuple(6, 6, subpel_avg_variance64x64_ssse3))); -#endif -#endif + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0), + make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0), + make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0), + make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0), + make_tuple(3, 4, 
subpel_avg_variance8x16_ssse3, 0), + make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0), + make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0), + make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0), + make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0), + make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0), + make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0), + make_tuple(6, 5, subpel_avg_variance64x32_ssse3, 0), + make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0))); +#endif // CONFIG_USE_X86INC +#endif // HAVE_SSSE3 #if HAVE_AVX2 @@ -886,11 +1886,11 @@ const vp9_variance_fn_t variance64x32_avx2 = vp9_variance64x32_avx2; const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2; INSTANTIATE_TEST_CASE_P( AVX2, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance16x16_avx2), - make_tuple(5, 4, variance32x16_avx2), - make_tuple(5, 5, variance32x32_avx2), - make_tuple(6, 5, variance64x32_avx2), - make_tuple(6, 6, variance64x64_avx2))); + ::testing::Values(make_tuple(4, 4, variance16x16_avx2, 0), + make_tuple(5, 4, variance32x16_avx2, 0), + make_tuple(5, 5, variance32x32_avx2, 0), + make_tuple(6, 5, variance64x32_avx2, 0), + make_tuple(6, 6, variance64x64_avx2, 0))); const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 = vp9_sub_pixel_variance32x32_avx2; @@ -898,8 +1898,8 @@ const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 = vp9_sub_pixel_variance64x64_avx2; INSTANTIATE_TEST_CASE_P( AVX2, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2), - make_tuple(6, 6, subpel_variance64x64_avx2))); + ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2, 0), + make_tuple(6, 6, subpel_variance64x64_avx2, 0))); const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 = vp9_sub_pixel_avg_variance32x32_avx2; @@ -907,8 +1907,8 @@ const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 = vp9_sub_pixel_avg_variance64x64_avx2; INSTANTIATE_TEST_CASE_P( AVX2, VP9SubpelAvgVarianceTest, - 
::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2), - make_tuple(6, 6, subpel_avg_variance64x64_avx2))); + ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0), + make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0))); #endif // HAVE_AVX2 #if HAVE_NEON const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon; @@ -916,9 +1916,9 @@ const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon; INSTANTIATE_TEST_CASE_P( NEON, VP9VarianceTest, - ::testing::Values(make_tuple(3, 3, variance8x8_neon), - make_tuple(4, 4, variance16x16_neon), - make_tuple(5, 5, variance32x32_neon))); + ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0), + make_tuple(4, 4, variance16x16_neon, 0), + make_tuple(5, 5, variance32x32_neon, 0))); const vp9_subpixvariance_fn_t subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon; @@ -928,12 +1928,11 @@ const vp9_subpixvariance_fn_t subpel_variance32x32_neon = vp9_sub_pixel_variance32x32_neon; INSTANTIATE_TEST_CASE_P( NEON, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon), - make_tuple(4, 4, subpel_variance16x16_neon), - make_tuple(5, 5, subpel_variance32x32_neon))); + ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0), + make_tuple(4, 4, subpel_variance16x16_neon, 0), + make_tuple(5, 5, subpel_variance32x32_neon, 0))); #endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER } // namespace vp9 - } // namespace diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 1234d54c7..5512476c5 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -112,6 +112,9 @@ typedef struct { // Common for both INTER and INTRA blocks BLOCK_SIZE sb_type; PREDICTION_MODE mode; +#if CONFIG_FILTERINTRA + int filterbit, uv_filterbit; +#endif TX_SIZE tx_size; int8_t skip; int8_t segment_id; @@ -126,11 +129,18 @@ typedef struct { int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; uint8_t 
mode_context[MAX_REF_FRAMES]; INTERP_FILTER interp_filter; + +#if CONFIG_EXT_TX + EXT_TX_TYPE ext_txfrm; +#endif } MB_MODE_INFO; typedef struct MODE_INFO { struct MODE_INFO *src_mi; MB_MODE_INFO mbmi; +#if CONFIG_FILTERINTRA + int b_filter_info[4]; +#endif b_mode_info bmi[4]; } MODE_INFO; @@ -139,6 +149,17 @@ static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) { : mi->mbmi.mode; } +#if CONFIG_FILTERINTRA +static INLINE int is_filter_allowed(PREDICTION_MODE mode) { + (void)mode; + return 1; +} + +static INLINE int is_filter_enabled(TX_SIZE txsize) { + return (txsize < TX_SIZES); +} +#endif + static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[0] > INTRA_FRAME; } @@ -236,12 +257,33 @@ static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES]; +#if CONFIG_EXT_TX +static TX_TYPE ext_tx_to_txtype(EXT_TX_TYPE ext_tx) { + switch (ext_tx) { + case NORM: + default: + return DCT_DCT; + case ALT: + return ADST_ADST; + } +} +#endif + static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd) { const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi; - if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi)) +#if CONFIG_EXT_TX + if (plane_type != PLANE_TYPE_Y || xd->lossless) + return DCT_DCT; + + if (is_inter_block(mbmi)) { + return ext_tx_to_txtype(mbmi->ext_txfrm); + } +#else + if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi)) return DCT_DCT; +#endif return intra_mode_to_tx_type_lookup[mbmi->mode]; } @@ -249,8 +291,17 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, const MACROBLOCKD *xd, int ib) { const MODE_INFO *const mi = xd->mi[0].src_mi; +#if CONFIG_EXT_TX + if (plane_type != PLANE_TYPE_Y || xd->lossless) + return DCT_DCT; + + if (is_inter_block(&mi->mbmi)) { + return ext_tx_to_txtype(mi->mbmi.ext_txfrm); + } +#else if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi)) return 
DCT_DCT; +#endif return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)]; } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index f3922b1ea..dcc6a1793 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1283,34 +1283,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # variance add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x16/; + specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x32/; + specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance64x32/; + specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x64/; + specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x32/; + specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance64x64/; + specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x16/; + specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x8/; + specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x16/; + specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x8/; + specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_variance8x4/; @@ -1322,40 +1322,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_variance4x4/; add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_get8x8var/; + specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc"; add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_get16x16var/; + specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x16/; + specialize qw/vp9_highbd_10_variance32x16/, 
"$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x32/; + specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance64x32/; + specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x64/; + specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x32/; + specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance64x64/; + specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x16/; + specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x8/; + specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x16/; + specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x8/; + specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_10_variance8x4/; @@ -1367,40 +1367,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_10_variance4x4/; add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_10_get8x8var/; + specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc"; add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_10_get16x16var/; + specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x16/; + specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x32/; + specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize 
qw/vp9_highbd_12_variance64x32/; + specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x64/; + specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x32/; + specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance64x64/; + specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x16/; + specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x8/; + specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x16/; + specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x8/; + specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int 
vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_12_variance8x4/; @@ -1412,76 +1412,76 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_12_variance4x4/; add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_12_get8x8var/; + specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc"; add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_12_get16x16var/; + specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance64x64/; + specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/; + specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance32x64/; + specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, 
const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/; + specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance64x32/; + specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/; + specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance32x16/; + specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/; + specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance16x32/; + specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned 
int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/; + specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance32x32/; + specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/; + specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance16x16/; + specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/; + specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance8x16/; + specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/; + specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance16x8/; + specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/; + specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance8x8/; + specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/; + specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance8x4/; + specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/; + specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_sub_pixel_variance4x8/; @@ -1496,70 +1496,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance64x64/; + specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance32x64/; + specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, 
int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance64x32/; + specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance32x16/; + specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance16x32/; + specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int 
vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance32x32/; + specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance16x16/; + specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance8x16/; + specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/; + specialize 
qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance16x8/; + specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance8x8/; + specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance8x4/; + specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize 
qw/vp9_highbd_10_sub_pixel_avg_variance8x4/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_10_sub_pixel_variance4x8/; @@ -1574,70 +1574,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance64x64/; + specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance32x64/; + specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, 
int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance64x32/; + specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance32x16/; + specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance16x32/; + specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int 
vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance32x32/; + specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance16x16/; + specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance8x16/; + specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/; + specialize 
qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance16x8/; + specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance8x8/; + specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance8x4/; + specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize 
qw/vp9_highbd_12_sub_pixel_avg_variance8x4/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_12_sub_pixel_variance4x8/; @@ -1817,7 +1817,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_sad4x4x4d sse2/; add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse16x16/; + specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_highbd_mse8x16/; @@ -1826,10 +1826,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_mse16x8/; add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse8x8/; + specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse16x16/; + specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_highbd_10_mse8x16/; @@ -1838,10 +1838,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_10_mse16x8/; add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - 
specialize qw/vp9_highbd_10_mse8x8/; + specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse16x16/; + specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_highbd_12_mse8x16/; @@ -1850,7 +1850,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_12_mse16x8/; add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse8x8/; + specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc"; # ENCODEMB INVOKE diff --git a/vp9/encoder/x86/vp9_highbd_subpel_variance.asm b/vp9/encoder/x86/vp9_highbd_subpel_variance.asm new file mode 100644 index 000000000..aebe63b74 --- /dev/null +++ b/vp9/encoder/x86/vp9_highbd_subpel_variance.asm @@ -0,0 +1,1043 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 ; rounding term added before the >> 4 of the bilinear filter +bilin_filter_m_sse2: times 8 dw 16 ; one (16 - x, x) pair of 8-word vectors per offset x = 0..15 + times 8 dw 0 + times 8 dw 15 + times 8 dw 1 + times 8 dw 14 + times 8 dw 2 + times 8 dw 13 + times 8 dw 3 + times 8 dw 12 + times 8 dw 4 + times 8 dw 11 + times 8 dw 5 + times 8 dw 10 + times 8 dw 6 + times 8 dw 9 + times 8 dw 7 + times 16 dw 8 + times 8 dw 7 + times 8 dw 9 + times 8 dw 6 + times 8 dw 10 + times 8 dw 5 + times 8 dw 11 + times 8 dw 4 + times 8 dw 12 + times 8 dw 3 + times 8 dw 13 + times 8 dw 2 + times 8 dw 14 + times 8 dw 1 + times 8 dw 15 + +SECTION .text + +; int vp9_highbd_sub_pixel_varianceNxh(const uint16_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint16_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; Samples are 16-bit and strides are in samples. This function returns the SE (sum of differences) and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse (all four input registers are clobbered) + psubw %3, %4 ; %3 = src2 - dst2, as signed words + psubw %1, %2 ; %1 = src1 - dst1, as signed words + mova %4, %3 ; make copies to manipulate to calc sum + mova %2, %1 ; use originals for calc sse + pmaddwd %3, %3 ; square the differences, adding word pairs into dwords + paddw %4, %2 + pmaddwd %1, %1 + movhlps %2, %4 ; bring the upper four word sums down to the low half + paddd %6, %3 + paddw %4, %2 ; each of the low four words now holds a sum of four differences + pxor %2, %2 + pcmpgtw %2, %4 ; mask for 0 > %4 (sum) + punpcklwd %4, %2 ; sign-extend word to dword + paddd %6, %1 + paddd %5, %4 + +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; The sum and sse accumulators are kept as four dwords per register; + ; SUM_SSE sign-extends its word sums to dwords before accumulating them. + ; The code below adds the four dwords of m7 and of m6 together, stores + ; the m7 total through the sse pointer and returns the m6 total. 
+ movhlps m3, m7 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + pshufd m4, m6, 0x1 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + movd [r1], m7 ; store sse + movd rax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + lea srcq, [srcq + src_stridemp*2] +%else + lea srcq, [srcq + src_strideq*2] +%endif +%endmacro + +%macro INC_SRC_BY_SRC_2STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + lea srcq, [srcq + src_stridemp*4] +%else + lea srcq, [srcq + src_strideq*4] +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 + + +%ifdef PIC ; 64bit PIC + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %endif + %define h heightd + %define bilin_filter sseq +%else + %if ARCH_X86=1 && CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse, g_bilin_filter, g_pw_8 + %define h dword heightm + %define sec_str sec_stridemp + + ; Store bilin_filter and pw_8 location in stack + GET_GOT eax + add esp, 4 ; restore esp + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, height, \ + sse, g_bilin_filter, g_pw_8 + %define h heightd + + ; Store bilin_filter and pw_8 location in stack + GET_GOT eax + add esp, 4 ; restore esp + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx 
+ + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %endif + %else + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse + %if ARCH_X86_64 + %define h heightd + %define sec_str sec_strideq + %else + %define h dword heightm + %define sec_str sec_stridemp + %endif + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, height, sse + %define h heightd + %endif + + %define bilin_filter bilin_filter_m + %endif +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + +%if %1 < 16 + sar h, 1 +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq + 16] + mova m1, [dstq] + mova m3, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+16] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq + src_strideq*2] + mova m1, [dstq] + mova m3, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq + sec_str*2] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m4, [srcq+src_strideq*2] + movu m5, 
[srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pavgw m0, m1 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+sec_str*2] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). 
Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m4, m1 + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+sec_str*2] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + mova m2, [dstq] + mova m3, [dstq + 16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu 
m0, [srcq] + movu m1, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + mova m2, [dstq] + mova m3, [dstq + dst_strideq*2] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+sec_str*2] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m1, m3 + mova m4, [dstq] + mova m5, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m2, m3 + mova m4, [dstq] + mova m5, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+sec_str*2] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, 
[bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m1, filter_rnd + paddw m1, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m1, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m4, filter_rnd + paddw m4, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m4, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 
+ mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m4, [secq+sec_str*2] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + mova m4, [dstq] + mova m5, [dstq+16] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m2, [srcq+2] + movu m3, [srcq+src_strideq*2+2] + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + pmullw m1, 
filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+sec_str*2] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. 
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+16] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m1, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m2, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+sec_str*2] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + 
lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +; loading filter - this is same as in 8-bit depth +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [bilin_filter+y_offsetq] + mova m11, [bilin_filter+y_offsetq+16] + mova m12, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. +%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif +; end of load filter + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq+2] + movu m1, [srcq+16] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + movu m3, 
[srcq+16] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m1, filter_rnd + mova m2, [dstq] + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + movu m3, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m4, filter_rnd + mova m2, [dstq] + paddw m4, m3 + psrlw m0, 4 + psrlw m4, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m4, [secq+sec_str*2] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + INC_SRC_BY_SRC_2STRIDE + lea dstq, [dstq + dst_strideq * 4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 
16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
new file mode 100644
index 000000000..821dd0660
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
@@ -0,0 +1,313 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_highbd_calc16x16var_sse2   -- 16 rows of 16 16-bit samples
+;(
+;    const uint16_t *src_ptr,
+;    int             source_stride,   ; in samples; doubled to bytes below
+;    const uint16_t *ref_ptr,
+;    int             recon_stride,    ; in samples; doubled to bytes below
+;    unsigned int   *SSE,             ; out: sum of squared (src - ref)
+;    int            *Sum              ; out: sum of (src - ref)
+;)  rax is not loaded with a result; it holds the Sum pointer at ret.
+global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
+sym(vp9_highbd_calc16x16var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+16]
+        prefetcht0      [rsi+rax]
+        prefetcht0      [rsi+rax+16]
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax+16]
+
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+16]
+        prefetcht0      [rdi+rdx]
+        prefetcht0      [rdi+rdx+16]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx+16]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            16       ; rows remaining
+
+.var16loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        lea             rbx,    [rsi+rax*2]  ; the two rows read by the next pass
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax+16]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx+16]
+
+        pxor        xmm5,           xmm5     ; 16-bit diff sums for this pass
+
+        psubw       xmm1,           xmm2     ; row 0, samples 0-7
+        movdqu      xmm3,           XMMWORD PTR [rsi+16]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1     ; squared diffs, summed pairwise
+        movdqu      xmm2,           XMMWORD PTR [rdi+16]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2     ; row 0, samples 8-15
+        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm3
+
+        psubw       xmm1,           xmm2     ; row 1, samples 0-7
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2     ; row 1, samples 8-15
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0     ; words > 0
+        pcmpeqw     xmm2,           xmm0     ; words == 0
+        por         xmm1,           xmm2     ; words >= 0
+        pcmpeqw     xmm1,           xmm0     ; invert: all-ones where word < 0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1     ; sign-extend low 4 words to dwords
+        punpckhwd   xmm2,           xmm1     ; sign-extend high 4 words to dwords
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+        sub         rcx,            2        ; two rows consumed per pass
+        jnz         .var16loop
+
+        movdqa      xmm4,           xmm6     ; horizontal add of the 4 dword lanes
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm4,           xmm0
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4     ; low dword = total sse
+        paddd       xmm7,           xmm5     ; low dword = total sum
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6
+        movd DWORD PTR [rax],       xmm7
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_highbd_calc8x8var_sse2   -- 8 rows of 8 16-bit samples
+;(
+;    const uint16_t *src_ptr,
+;    int             source_stride,   ; in samples; doubled to bytes below
+;    const uint16_t *ref_ptr,
+;    int             recon_stride,    ; in samples; doubled to bytes below
+;    unsigned int   *SSE,             ; out: sum of squared (src - ref)
+;    int            *Sum              ; out: sum of (src - ref)
+;)  rax is not loaded with a result; it holds the Sum pointer at ret.
+global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
+sym(vp9_highbd_calc8x8var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+rax]
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+rdx]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            8        ; rows remaining
+
+.var8loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        lea             rbx,    [rsi+rax*4]  ; the four rows read by the next pass
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        lea             rbx,    [rbx+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        lea             rbx,    [rdi+rdx*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+        lea             rbx,    [rbx+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+
+        pxor        xmm5,           xmm5     ; 16-bit diff sums for this pass
+
+        psubw       xmm1,           xmm2     ; row 0
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1     ; squared diffs, summed pairwise
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm1
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+
+        psubw       xmm3,           xmm2     ; row 1
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+        paddd       xmm6,           xmm3
+
+        psubw       xmm1,           xmm2     ; row 2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2     ; row 3
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0     ; words > 0
+        pcmpeqw     xmm2,           xmm0     ; words == 0
+        por         xmm1,           xmm2     ; words >= 0
+        pcmpeqw     xmm1,           xmm0     ; invert: all-ones where word < 0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1     ; sign-extend low 4 words to dwords
+        punpckhwd   xmm2,           xmm1     ; sign-extend high 4 words to dwords
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+        sub         rcx,            4        ; four rows consumed per pass
+        jnz         .var8loop
+
+        movdqa      xmm4,           xmm6     ; horizontal add of the 4 dword lanes
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm4,           xmm0
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4     ; low dword = total sse
+        paddd       xmm7,           xmm5     ; low dword = total sum
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6
+        movd DWORD PTR [rax],       xmm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/vp9/encoder/x86/vp9_highbd_variance_sse2.c
new file mode 100644
index 000000000..4bc3e7e2d
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_variance_sse2.c
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_variance.h"
+#include "vpx_ports/mem.h"
+
+/* Signature shared by the fixed-size kernels below: each writes the sum of
+ * squared differences to *sse and the sum of differences to *sum. */
+typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
+                                        const uint16_t *ref, int ref_stride,
+                                        uint32_t *sse, int *sum);
+
+/* Assembly kernels from vp9_highbd_variance_impl_sse2.asm. */
+uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    uint32_t *sse, int *sum);
+
+uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+                                      const uint16_t *ref, int ref_stride,
+                                      uint32_t *sse, int *sum);
+
+/* Covers a w x h area with block_size x block_size calls to var_fn and adds
+ * the per-tile results into *sse and *sum (both 32-bit, no rescaling). */
+static void highbd_variance_sse2(const uint16_t *src, int src_stride,
+                                 const uint16_t *ref, int ref_stride,
+                                 int w, int h, uint32_t *sse, int *sum,
+                                 high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+/* Same tiling as highbd_variance_sse2, but the totals are kept in 64 bits and
+ * then scaled with ROUND_POWER_OF_TWO: sum by 2 bits, sse by 4 bits. */
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+/* 12-bit counterpart of highbd_10_variance_sse2: sum is scaled by 4 bits and
+ * sse by 8 bits. */
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+
+/* HIGH_GET_VAR(S) defines vp9_highbd_get<S>x<S>var_sse2 and its _10_ / _12_
+ * variants: thin wrappers that convert the uint8_t-typed pointers with
+ * CONVERT_TO_SHORTPTR, run the SxS kernel once, and (for 10/12-bit) scale
+ * *sum and *sse in place. */
+#define HIGH_GET_VAR(S) \
+void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                       const uint8_t *ref8, int ref_stride, \
+                                       uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+} \
+\
+void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+} \
+\
+void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+}
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
+/* VAR_FN(w, h, block_size, shift) defines the w x h variance functions for
+ * 8-, 10- and 12-bit input.  Each stores the (scaled) SSE in *sse and returns
+ * sse - sum^2 / (w * h), with shift == log2(w * h).
+ *
+ * In the 10- and 12-bit variants sum and sse are rounded independently, so
+ * the squared-mean term can come out larger than the rounded sse (e.g. a
+ * 12-bit 8x8 block with 56 differences of 4000 and 8 of 3999).  The
+ * subtraction is therefore done in 64 bits and clamped at zero instead of
+ * being allowed to wrap around in the unsigned return value. */
+#define VAR_FN(w, h, block_size, shift) \
+uint32_t vp9_highbd_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+                       vp9_highbd_calc##block_size##x##block_size##var_sse2, \
+                       block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  int64_t var; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_10_variance_sse2( \
+      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+      vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+} \
+\
+uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  int64_t var; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_12_variance_sse2( \
+      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+      vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+}
+
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
+
+/* The mse functions below reuse the variance helpers but return the SSE
+ * itself; sum is computed only because the helpers require the out-param. */
+unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                       sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                    const uint8_t *ref8, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                       sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+/* Prototypes of the w-wide, arbitrary-height sub-pixel column kernels; each
+ * returns the sum of differences and writes the SSE through *sse. */
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
+                                               ptrdiff_t src_stride, \
+                                               int x_offset, int y_offset, \
+                                               const uint16_t *dst, \
+                                               ptrdiff_t dst_stride, \
+                                               int height, unsigned int *sse);
+#define DECLS(opt1, opt2) \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+/* FN(w, h, wf, ...) defines the 8-, 10- and 12-bit
+ * vp9_highbd_*sub_pixel_variance<w>x<h> wrappers on top of the wf-wide
+ * column kernels declared above. */
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
+                                                        int src_stride, \
+                                                        int x_offset, \
+                                                        int y_offset, \
+                                                        const uint8_t *dst8, \
+                                                        int dst_stride, \
+                                                        uint32_t *sse_ptr) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst, dst_stride, h, \
+                                                       &sse); \
+  /* Wider blocks are covered by further wf-wide columns at +16/+32/+48. */ \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+                                                          src_stride, \
+                                                          x_offset, y_offset, \
+                                                          dst + 16, \
+                                                          dst_stride, \
+                                                          h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 32, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 48, src_stride, x_offset, y_offset, \
+          dst + 48, dst_stride, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  int64_t var; \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst, dst_stride, \
+                                                       h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+                                                          src_stride, \
+                                                          x_offset, y_offset, \
+                                                          dst + 16, \
+                                                          dst_stride, \
+                                                          h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 32, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 48, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  /* se and sse are rounded independently, so the difference can dip below */ \
+  /* zero; clamp instead of wrapping around in the unsigned return value.  */ \
+  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+} \
+\
+uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  int start_row; \
+  int64_t var; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  /* Work in strips of at most 16 rows and total the SSE in 64 bits; */ \
+  /* presumably this keeps each kernel call's 32-bit SSE in range.   */ \
+  for (start_row = 0; start_row < h; start_row += 16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+        src + (start_row * src_stride), src_stride, \
+        x_offset, y_offset, dst + (start_row * dst_stride), \
+        dst_stride, height, &sse2); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 16 + (start_row * src_stride), src_stride, \
+          x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+          dst_stride, height, &sse2); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 32 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+            dst_stride, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 48 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+            dst_stride, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+      } \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  /* Same independent-rounding hazard as the 10-bit variant: clamp at 0. */ \
+  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+
+FNS(sse2, sse);
+
+#undef FNS
+#undef FN
+
+/* Prototypes of the w-wide column kernels that additionally average the
+ * filtered prediction with a second predictor `sec` before comparing. */
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
+                                                   ptrdiff_t src_stride, \
+                                                   int x_offset, int y_offset, \
+                                                   const uint16_t *dst, \
+                                                   ptrdiff_t dst_stride, \
+                                                   const uint16_t *sec, \
+                                                   ptrdiff_t sec_stride, \
+                                                   int height, \
+                                                   unsigned int *sse);
+#define DECLS(opt1) \
+DECL(16, opt1) \
+DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
+/* FN(w, h, wf, ...) defines the 8-, 10- and 12-bit
+ * vp9_highbd_*sub_pixel_avg_variance<w>x<h> wrappers.  The second predictor
+ * is passed to the kernels with a stride of w samples.  As in the non-avg
+ * wrappers, the 10/12-bit results are clamped at zero because se and sse are
+ * rounded independently before the subtraction. */
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+               src, src_stride, x_offset, \
+               y_offset, dst, dst_stride, sec, w, h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                  src + 16, src_stride, x_offset, y_offset, \
+                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32, src_stride, x_offset, y_offset, \
+                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48, src_stride, x_offset, y_offset, \
+                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  int64_t var; \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+               src, src_stride, x_offset, \
+               y_offset, dst, dst_stride, \
+               sec, w, h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                  src + 16, src_stride, \
+                  x_offset, y_offset, \
+                  dst + 16, dst_stride, \
+                  sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32, src_stride, \
+                x_offset, y_offset, \
+                dst + 32, dst_stride, \
+                sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48, src_stride, \
+                x_offset, y_offset, \
+                dst + 48, dst_stride, \
+                sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+} \
+\
+uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  int start_row; \
+  int64_t var; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  for (start_row = 0; start_row < h; start_row += 16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + (start_row * src_stride), src_stride, x_offset, \
+                y_offset, dst + (start_row * dst_stride), dst_stride, \
+                sec + (start_row * w), w, height, &sse2); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 16 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 16 + (start_row * dst_stride), dst_stride, \
+                sec + 16 + (start_row * w), w, height, &sse2); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 32 + (start_row * dst_stride), dst_stride, \
+                sec + 32 + (start_row * w), w, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 48 + (start_row * dst_stride), dst_stride, \
+                sec + 48 + (start_row * w), w, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+      } \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+}
+
+
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index b3a37745b..651b4c168 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -104,6 +104,7 @@ VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm endif ifeq ($(CONFIG_USE_X86INC),yes) @@ -115,6 +116,8 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm endif endif