From: Peter de Rivaz
Date: Thu, 16 Oct 2014 12:41:55 +0000 (+0100)
Subject: Added highbitdepth sse2 SAD acceleration and tests
X-Git-Tag: v1.4.0~511^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7eee487c0036340e99425cc1cf1503e21e70678a;p=libvpx

Added highbitdepth sse2 SAD acceleration and tests

Change-Id: I1a74a1b032b198793ef9cc526327987f7799125f
(cherry picked from commit b1a6f6b9cb47eafe0ce86eaf0318612806091fe5)
---

diff --git a/test/sad_test.cc b/test/sad_test.cc
index c7042fe50..c7eadddc7 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -27,6 +27,7 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_codec.h"

 #if CONFIG_VP8_ENCODER
@@ -35,42 +36,83 @@ typedef unsigned int (*SadMxNFunc)(const unsigned char *source_ptr,
                                    int source_stride,
                                    const unsigned char *reference_ptr,
                                    int reference_stride,
                                    unsigned int max_sad);
-typedef std::tr1::tuple<int, int, SadMxNFunc> SadMxNParam;
+typedef std::tr1::tuple<int, int, SadMxNFunc, int> SadMxNParam;
 #endif
 #if CONFIG_VP9_ENCODER
 typedef unsigned int (*SadMxNVp9Func)(const unsigned char *source_ptr,
                                       int source_stride,
                                       const unsigned char *reference_ptr,
                                       int reference_stride);
-typedef std::tr1::tuple<int, int, SadMxNVp9Func> SadMxNVp9Param;
+typedef std::tr1::tuple<int, int, SadMxNVp9Func, int> SadMxNVp9Param;
+typedef uint32_t (*SadMxNAvgVp9Func)(const uint8_t *source_ptr,
+                                     int source_stride,
+                                     const uint8_t *reference_ptr,
+                                     int reference_stride,
+                                     const uint8_t *second_pred);
+typedef std::tr1::tuple<int, int, SadMxNAvgVp9Func, int> SadMxNAvgVp9Param;
 #endif

 typedef void (*SadMxNx4Func)(const uint8_t *src_ptr,
                              int src_stride,
-                             const unsigned char *const ref_ptr[],
+                             const uint8_t *const ref_ptr[],
                              int ref_stride,
-                             unsigned int *sad_array);
-typedef std::tr1::tuple<int, int, SadMxNx4Func> SadMxNx4Param;
+                             uint32_t *sad_array);
+typedef std::tr1::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;

 using libvpx_test::ACMRandom;

 namespace {
 class SADTestBase : public ::testing::Test {
  public:
-  SADTestBase(int width, int height) : width_(width), height_(height) {}
+  SADTestBase(int width, int height, int bit_depth) :
+      width_(width), height_(height), bd_(bit_depth) {}

   static void SetUpTestCase() {
+#if CONFIG_VP9_HIGHBITDEPTH
+    source_data8_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize));
+    reference_data8_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    second_pred8_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, 64*64));
+    source_data16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize*sizeof(uint16_t)));
+    reference_data16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize*sizeof(uint16_t)));
+    second_pred16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, 64*64*sizeof(uint16_t)));
+#else
     source_data_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kDataBlockSize));
     reference_data_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kDataBufferSize));
+    second_pred_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, 64*64));
+#endif
   }

   static void TearDownTestCase() {
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_free(source_data8_);
+    source_data8_ = NULL;
+    vpx_free(reference_data8_);
+    reference_data8_ = NULL;
+    vpx_free(second_pred8_);
+    second_pred8_ = NULL;
+    vpx_free(source_data16_);
+    source_data16_ = NULL;
+    vpx_free(reference_data16_);
+    reference_data16_ = NULL;
+    vpx_free(second_pred16_);
+    second_pred16_ = NULL;
+#else
     vpx_free(source_data_);
     source_data_ = NULL;
     vpx_free(reference_data_);
     reference_data_ = NULL;
+    vpx_free(second_pred_);
+    second_pred_ = NULL;
+#endif
   }

   virtual void TearDown() {
@@ -84,25 +126,117 @@ class SADTestBase : public ::testing::Test {
   static const int kDataBufferSize = 4 * kDataBlockSize;

   virtual void SetUp() {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (bd_ == -1) {
+      use_high_bit_depth_ = false;
+      bit_depth_ = VPX_BITS_8;
+      source_data_ = source_data8_;
+      reference_data_ = reference_data8_;
+      second_pred_ = second_pred8_;
+    } else {
+      use_high_bit_depth_ = true;
+      bit_depth_ = static_cast<vpx_bit_depth_t>(bd_);
+      source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
+      reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+      second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+    }
+#endif
+    mask_ = (1 << bit_depth_) - 1;
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }

-  virtual uint8_t* GetReference(int block_idx) {
+  virtual uint8_t *GetReference(int block_idx) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      return reference_data_ + block_idx * kDataBlockSize;
+    } else {
+      return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+                                block_idx * kDataBlockSize);
+    }
+#else
     return reference_data_ + block_idx * kDataBlockSize;
+#endif
   }

   // Sum of Absolute Differences. Given two blocks, calculate the absolute
   // difference between two pixels in the same relative location; accumulate.
   unsigned int ReferenceSAD(unsigned int max_sad, int block_idx) {
     unsigned int sad = 0;
-    const uint8_t* const reference = GetReference(block_idx);
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const source8 = source_data_;
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+#else
+    const uint8_t *const reference = GetReference(block_idx);
+    const uint8_t *const source = source_data_;
+#endif
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!use_high_bit_depth_) {
+          sad +=
+              abs(source8[h * source_stride_ + w] -
+                  reference8[h * reference_stride_ + w]);
+        } else {
+          sad +=
+              abs(source16[h * source_stride_ + w] -
+                  reference16[h * reference_stride_ + w]);
+        }
+#else
+        sad +=
+            abs(source[h * source_stride_ + w] -
+                reference[h * reference_stride_ + w]);
+#endif
+      }
+      if (sad > max_sad) {
+        break;
+      }
+    }
+    return sad;
+  }
+
+  // Sum of Absolute Differences Average. Given two blocks and a prediction,
+  // calculate the absolute difference between one pixel and the average of
+  // the corresponding and predicted pixels; accumulate.
+  unsigned int ReferenceSADavg(unsigned int max_sad, int block_idx) {
+    unsigned int sad = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const source8 = source_data_;
+    const uint8_t *const second_pred8 = second_pred_;
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+    const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+#else
+    const uint8_t *const reference = GetReference(block_idx);
+    const uint8_t *const source = source_data_;
+    const uint8_t *const second_pred = second_pred_;
+#endif
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
-        sad += abs(source_data_[h * source_stride_ + w] -
-                   reference[h * reference_stride_ + w]);
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!use_high_bit_depth_) {
+          const int tmp = second_pred8[h * width_ + w] +
+              reference8[h * reference_stride_ + w];
+          const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+          sad += abs(source8[h * source_stride_ + w] - comp_pred);
+        } else {
+          const int tmp = second_pred16[h * width_ + w] +
+              reference16[h * reference_stride_ + w];
+          const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+          sad += abs(source16[h * source_stride_ + w] - comp_pred);
+        }
+#else
+        const int tmp = second_pred[h * width_ + w] +
+            reference[h * reference_stride_ + w];
+        const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+        sad += abs(source[h * source_stride_ + w] - comp_pred);
+#endif
       }
       if (sad > max_sad) {
         break;
@@ -111,26 +245,61 @@ class SADTestBase : public ::testing::Test {
     return sad;
   }

-  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) {
+  void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint8_t *data8 = data;
+    uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+#endif
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!use_high_bit_depth_) {
+          data8[h * stride + w] = fill_constant;
+        } else {
+          data16[h * stride + w] = fill_constant;
+        }
+#else
         data[h * stride + w] = fill_constant;
+#endif
       }
     }
   }

   void FillRandom(uint8_t *data, int stride) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint8_t *data8 = data;
+    uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+#endif
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!use_high_bit_depth_) {
+          data8[h * stride + w] = rnd_.Rand8();
+        } else {
+          data16[h * stride + w] = rnd_.Rand16() & mask_;
+        }
+#else
         data[h * stride + w] = rnd_.Rand8();
+#endif
       }
     }
   }

-  int width_, height_;
-  static uint8_t* source_data_;
+  int width_, height_, mask_, bd_;
+  vpx_bit_depth_t bit_depth_;
+  static uint8_t *source_data_;
+  static uint8_t *reference_data_;
+  static uint8_t *second_pred_;
   int source_stride_;
-  static uint8_t* reference_data_;
+#if CONFIG_VP9_HIGHBITDEPTH
+  bool use_high_bit_depth_;
+  static uint8_t *source_data8_;
+  static uint8_t *reference_data8_;
+  static uint8_t *second_pred8_;
+  static uint16_t *source_data16_;
+  static uint16_t *reference_data16_;
+  static uint16_t *second_pred16_;
+#endif
   int reference_stride_;

   ACMRandom rnd_;
@@ -140,11 +309,11 @@ class SADx4Test
     : public SADTestBase,
       public ::testing::WithParamInterface<SadMxNx4Param> {
  public:
-  SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+  SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}

  protected:
   void SADs(unsigned int *results) {
-    const uint8_t* refs[] = {GetReference(0), GetReference(1),
-                             GetReference(2), GetReference(3)};
+    const uint8_t *refs[] = {GetReference(0), GetReference(1),
+                             GetReference(2), GetReference(3)};

     ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
@@ -169,12 +338,12 @@ class SADTest
     : public SADTestBase,
       public ::testing::WithParamInterface<SadMxNParam> {
  public:
-  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}

  protected:
   unsigned int SAD(unsigned int max_sad, int block_idx) {
     unsigned int ret;
-    const uint8_t* const reference = GetReference(block_idx);
+    const uint8_t *const reference = GetReference(block_idx);

     ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_,
@@ -201,12 +370,12 @@ class SADVP9Test
     : public SADTestBase,
       public ::testing::WithParamInterface<SadMxNVp9Param> {
  public:
-  SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+  SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}

  protected:
   unsigned int SAD(int block_idx) {
     unsigned int ret;
-    const uint8_t* const reference = GetReference(block_idx);
+    const uint8_t *const reference = GetReference(block_idx);

     ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_));
@@ -220,20 +389,54 @@ class SADVP9Test
     ASSERT_EQ(reference_sad, exp_sad);
   }
 };
+
+class SADavgVP9Test
+    : public SADTestBase,
+      public ::testing::WithParamInterface<SadMxNAvgVp9Param> {
+ public:
+  SADavgVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  unsigned int SAD_avg(int block_idx) {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_,
+                                                second_pred_));
+    return ret;
+  }
+
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSADavg(UINT_MAX, 0);
+    const unsigned int exp_sad = SAD_avg(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+};
 #endif  // CONFIG_VP9_ENCODER

-uint8_t* SADTestBase::source_data_ = NULL;
-uint8_t* SADTestBase::reference_data_ = NULL;
+uint8_t *SADTestBase::source_data_ = NULL;
+uint8_t *SADTestBase::reference_data_ = NULL;
+uint8_t *SADTestBase::second_pred_ = NULL;
+#if CONFIG_VP9_ENCODER && CONFIG_VP9_HIGHBITDEPTH
+uint8_t *SADTestBase::source_data8_ = NULL;
+uint8_t *SADTestBase::reference_data8_ = NULL;
+uint8_t *SADTestBase::second_pred8_ = NULL;
+uint16_t *SADTestBase::source_data16_ = NULL;
+uint16_t *SADTestBase::reference_data16_ = NULL;
+uint16_t *SADTestBase::second_pred16_ = NULL;
+#endif

 #if CONFIG_VP8_ENCODER
 TEST_P(SADTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
-  FillConstant(reference_data_, reference_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, mask_);
   CheckSAD(UINT_MAX);
 }

 TEST_P(SADTest, MaxSrc) {
-  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(source_data_, source_stride_, mask_);
   FillConstant(reference_data_, reference_stride_, 0);
   CheckSAD(UINT_MAX);
 }
@@ -270,7 +473,7 @@ TEST_P(SADTest, ShortSrc) {
 TEST_P(SADTest, MaxSAD) {
   // Verify that, when max_sad is set, the implementation does not return a
   // value lower than the reference.
- FillConstant(source_data_, source_stride_, 255); + FillConstant(source_data_, source_stride_, mask_); FillConstant(reference_data_, reference_stride_, 0); CheckSAD(128); } @@ -279,12 +482,12 @@ TEST_P(SADTest, MaxSAD) { #if CONFIG_VP9_ENCODER TEST_P(SADVP9Test, MaxRef) { FillConstant(source_data_, source_stride_, 0); - FillConstant(reference_data_, reference_stride_, 255); + FillConstant(reference_data_, reference_stride_, mask_); CheckSAD(); } TEST_P(SADVP9Test, MaxSrc) { - FillConstant(source_data_, source_stride_, 255); + FillConstant(source_data_, source_stride_, mask_); FillConstant(reference_data_, reference_stride_, 0); CheckSAD(); } @@ -317,19 +520,64 @@ TEST_P(SADVP9Test, ShortSrc) { CheckSAD(); source_stride_ = tmp_stride; } + +TEST_P(SADavgVP9Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + FillConstant(second_pred_, width_, 0); + CheckSAD(); +} +TEST_P(SADavgVP9Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + FillConstant(second_pred_, width_, 0); + CheckSAD(); +} + +TEST_P(SADavgVP9Test, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADavgVP9Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADavgVP9Test, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + source_stride_ = tmp_stride; +} #endif // CONFIG_VP9_ENCODER TEST_P(SADx4Test, MaxRef) { FillConstant(source_data_, source_stride_, 0); - FillConstant(GetReference(0), reference_stride_, 255); - FillConstant(GetReference(1), reference_stride_, 255); - FillConstant(GetReference(2), reference_stride_, 255); - FillConstant(GetReference(3), reference_stride_, 255); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); CheckSADs(); } TEST_P(SADx4Test, MaxSrc) { - FillConstant(source_data_, source_stride_, 255); + FillConstant(source_data_, source_stride_, mask_); FillConstant(GetReference(0), reference_stride_, 0); FillConstant(GetReference(1), reference_stride_, 0); FillConstant(GetReference(2), reference_stride_, 0); @@ -375,6 +623,18 @@ TEST_P(SADx4Test, ShortSrc) { source_stride_ = tmp_stride; } +TEST_P(SADx4Test, SrcAlignedByWidth) { + uint8_t * tmp_source_data = source_data_; + source_data_ += width_; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + using std::tr1::make_tuple; 
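For reference, the check performed by SADavgVP9Test::CheckSAD reduces to the following scalar computation. This is a minimal standalone sketch of the ReferenceSADavg logic above, shown for the 16-bit path; ROUND_POWER_OF_TWO is spelled out to match the libvpx macro, and sad_avg_ref is an illustrative name, not a function from this commit.

#include <cstdint>
#include <cstdlib>

// Matches the libvpx macro: round-to-nearest right shift by n.
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

// Scalar model of an averaging SAD: each reference pixel is first averaged
// with the second predictor (rounding up on ties), then the absolute
// difference against the source pixel is accumulated. second_pred is a
// packed width-by-height block, so its stride is width.
static unsigned int sad_avg_ref(const uint16_t *src, int src_stride,
                                const uint16_t *ref, int ref_stride,
                                const uint16_t *second_pred,
                                int width, int height) {
  unsigned int sad = 0;
  for (int h = 0; h < height; ++h) {
    for (int w = 0; w < width; ++w) {
      const int tmp = second_pred[h * width + w] + ref[h * ref_stride + w];
      const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
      sad += abs(src[h * src_stride + w] - comp_pred);
    }
  }
  return sad;
}

The 8-bit branch of ReferenceSADavg is identical apart from the pixel type. Note that the (tmp + 1) >> 1 rounding produced by ROUND_POWER_OF_TWO(tmp, 1) is exactly what the SSE2 pavgw instruction computes, which is what lets the assembly further down fold the second predictor into the SAD loop with a single instruction per vector.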
//------------------------------------------------------------------------------ @@ -386,11 +646,11 @@ const SadMxNFunc sad_16x8_c = vp8_sad16x8_c; const SadMxNFunc sad_8x8_c = vp8_sad8x8_c; const SadMxNFunc sad_4x4_c = vp8_sad4x4_c; const SadMxNParam c_tests[] = { - make_tuple(16, 16, sad_16x16_c), - make_tuple(8, 16, sad_8x16_c), - make_tuple(16, 8, sad_16x8_c), - make_tuple(8, 8, sad_8x8_c), - make_tuple(4, 4, sad_4x4_c), + make_tuple(16, 16, sad_16x16_c, -1), + make_tuple(8, 16, sad_8x16_c, -1), + make_tuple(16, 8, sad_16x8_c, -1), + make_tuple(8, 8, sad_8x8_c, -1), + make_tuple(4, 4, sad_4x4_c, -1), }; INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests)); #endif // CONFIG_VP8_ENCODER @@ -406,15 +666,15 @@ const SadMxNVp9Func sad_8x4_c_vp9 = vp9_sad8x4_c; const SadMxNVp9Func sad_4x8_c_vp9 = vp9_sad4x8_c; const SadMxNVp9Func sad_4x4_c_vp9 = vp9_sad4x4_c; const SadMxNVp9Param c_vp9_tests[] = { - make_tuple(64, 64, sad_64x64_c_vp9), - make_tuple(32, 32, sad_32x32_c_vp9), - make_tuple(16, 16, sad_16x16_c_vp9), - make_tuple(8, 16, sad_8x16_c_vp9), - make_tuple(16, 8, sad_16x8_c_vp9), - make_tuple(8, 8, sad_8x8_c_vp9), - make_tuple(8, 4, sad_8x4_c_vp9), - make_tuple(4, 8, sad_4x8_c_vp9), - make_tuple(4, 4, sad_4x4_c_vp9), + make_tuple(64, 64, sad_64x64_c_vp9, -1), + make_tuple(32, 32, sad_32x32_c_vp9, -1), + make_tuple(16, 16, sad_16x16_c_vp9, -1), + make_tuple(8, 16, sad_8x16_c_vp9, -1), + make_tuple(16, 8, sad_16x8_c_vp9, -1), + make_tuple(8, 8, sad_8x8_c_vp9, -1), + make_tuple(8, 4, sad_8x4_c_vp9, -1), + make_tuple(4, 8, sad_4x8_c_vp9, -1), + make_tuple(4, 4, sad_4x4_c_vp9, -1), }; INSTANTIATE_TEST_CASE_P(C, SADVP9Test, ::testing::ValuesIn(c_vp9_tests)); @@ -432,19 +692,186 @@ const SadMxNx4Func sad_8x4x4d_c = vp9_sad8x4x4d_c; const SadMxNx4Func sad_4x8x4d_c = vp9_sad4x8x4d_c; const SadMxNx4Func sad_4x4x4d_c = vp9_sad4x4x4d_c; INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values( - make_tuple(64, 64, sad_64x64x4d_c), - make_tuple(64, 32, sad_64x32x4d_c), - make_tuple(32, 64, sad_32x64x4d_c), - make_tuple(32, 32, sad_32x32x4d_c), - make_tuple(32, 16, sad_32x16x4d_c), - make_tuple(16, 32, sad_16x32x4d_c), - make_tuple(16, 16, sad_16x16x4d_c), - make_tuple(16, 8, sad_16x8x4d_c), - make_tuple(8, 16, sad_8x16x4d_c), - make_tuple(8, 8, sad_8x8x4d_c), - make_tuple(8, 4, sad_8x4x4d_c), - make_tuple(4, 8, sad_4x8x4d_c), - make_tuple(4, 4, sad_4x4x4d_c))); + make_tuple(64, 64, sad_64x64x4d_c, -1), + make_tuple(64, 32, sad_64x32x4d_c, -1), + make_tuple(32, 64, sad_32x64x4d_c, -1), + make_tuple(32, 32, sad_32x32x4d_c, -1), + make_tuple(32, 16, sad_32x16x4d_c, -1), + make_tuple(16, 32, sad_16x32x4d_c, -1), + make_tuple(16, 16, sad_16x16x4d_c, -1), + make_tuple(16, 8, sad_16x8x4d_c, -1), + make_tuple(8, 16, sad_8x16x4d_c, -1), + make_tuple(8, 8, sad_8x8x4d_c, -1), + make_tuple(8, 4, sad_8x4x4d_c, -1), + make_tuple(4, 8, sad_4x8x4d_c, -1), + make_tuple(4, 4, sad_4x4x4d_c, -1))); + +#if CONFIG_VP9_HIGHBITDEPTH +const SadMxNVp9Func highbd_sad_64x64_c_vp9 = vp9_highbd_sad64x64_c; +const SadMxNVp9Func highbd_sad_32x32_c_vp9 = vp9_highbd_sad32x32_c; +const SadMxNVp9Func highbd_sad_16x16_c_vp9 = vp9_highbd_sad16x16_c; +const SadMxNVp9Func highbd_sad_8x16_c_vp9 = vp9_highbd_sad8x16_c; +const SadMxNVp9Func highbd_sad_16x8_c_vp9 = vp9_highbd_sad16x8_c; +const SadMxNVp9Func highbd_sad_8x8_c_vp9 = vp9_highbd_sad8x8_c; +const SadMxNVp9Func highbd_sad_8x4_c_vp9 = vp9_highbd_sad8x4_c; +const SadMxNVp9Func highbd_sad_4x8_c_vp9 = vp9_highbd_sad4x8_c; +const SadMxNVp9Func highbd_sad_4x4_c_vp9 = 
vp9_highbd_sad4x4_c; +const SadMxNVp9Param c_vp9_highbd_8_tests[] = { + make_tuple(64, 64, highbd_sad_64x64_c_vp9, 8), + make_tuple(32, 32, highbd_sad_32x32_c_vp9, 8), + make_tuple(16, 16, highbd_sad_16x16_c_vp9, 8), + make_tuple(8, 16, highbd_sad_8x16_c_vp9, 8), + make_tuple(16, 8, highbd_sad_16x8_c_vp9, 8), + make_tuple(8, 8, highbd_sad_8x8_c_vp9, 8), + make_tuple(8, 4, highbd_sad_8x4_c_vp9, 8), + make_tuple(4, 8, highbd_sad_4x8_c_vp9, 8), + make_tuple(4, 4, highbd_sad_4x4_c_vp9, 8), +}; +INSTANTIATE_TEST_CASE_P(C_8, SADVP9Test, + ::testing::ValuesIn(c_vp9_highbd_8_tests)); + +const SadMxNVp9Param c_vp9_highbd_10_tests[] = { + make_tuple(64, 64, highbd_sad_64x64_c_vp9, 10), + make_tuple(32, 32, highbd_sad_32x32_c_vp9, 10), + make_tuple(16, 16, highbd_sad_16x16_c_vp9, 10), + make_tuple(8, 16, highbd_sad_8x16_c_vp9, 10), + make_tuple(16, 8, highbd_sad_16x8_c_vp9, 10), + make_tuple(8, 8, highbd_sad_8x8_c_vp9, 10), + make_tuple(8, 4, highbd_sad_8x4_c_vp9, 10), + make_tuple(4, 8, highbd_sad_4x8_c_vp9, 10), + make_tuple(4, 4, highbd_sad_4x4_c_vp9, 10), +}; +INSTANTIATE_TEST_CASE_P(C_10, SADVP9Test, + ::testing::ValuesIn(c_vp9_highbd_10_tests)); + +const SadMxNVp9Param c_vp9_highbd_12_tests[] = { + make_tuple(64, 64, highbd_sad_64x64_c_vp9, 12), + make_tuple(32, 32, highbd_sad_32x32_c_vp9, 12), + make_tuple(16, 16, highbd_sad_16x16_c_vp9, 12), + make_tuple(8, 16, highbd_sad_8x16_c_vp9, 12), + make_tuple(16, 8, highbd_sad_16x8_c_vp9, 12), + make_tuple(8, 8, highbd_sad_8x8_c_vp9, 12), + make_tuple(8, 4, highbd_sad_8x4_c_vp9, 12), + make_tuple(4, 8, highbd_sad_4x8_c_vp9, 12), + make_tuple(4, 4, highbd_sad_4x4_c_vp9, 12), +}; +INSTANTIATE_TEST_CASE_P(C_12, SADVP9Test, + ::testing::ValuesIn(c_vp9_highbd_12_tests)); + +const SadMxNAvgVp9Func highbd_sad8x4_avg_c_vp9 = vp9_highbd_sad8x4_avg_c; +const SadMxNAvgVp9Func highbd_sad8x8_avg_c_vp9 = vp9_highbd_sad8x8_avg_c; +const SadMxNAvgVp9Func highbd_sad8x16_avg_c_vp9 = vp9_highbd_sad8x16_avg_c; +const SadMxNAvgVp9Func highbd_sad16x8_avg_c_vp9 = vp9_highbd_sad16x8_avg_c; +const SadMxNAvgVp9Func highbd_sad16x16_avg_c_vp9 = vp9_highbd_sad16x16_avg_c; +const SadMxNAvgVp9Func highbd_sad16x32_avg_c_vp9 = vp9_highbd_sad16x32_avg_c; +const SadMxNAvgVp9Func highbd_sad32x16_avg_c_vp9 = vp9_highbd_sad32x16_avg_c; +const SadMxNAvgVp9Func highbd_sad32x32_avg_c_vp9 = vp9_highbd_sad32x32_avg_c; +const SadMxNAvgVp9Func highbd_sad32x64_avg_c_vp9 = vp9_highbd_sad32x64_avg_c; +const SadMxNAvgVp9Func highbd_sad64x32_avg_c_vp9 = vp9_highbd_sad64x32_avg_c; +const SadMxNAvgVp9Func highbd_sad64x64_avg_c_vp9 = vp9_highbd_sad64x64_avg_c; +SadMxNAvgVp9Param avg_c_vp9_highbd_8_tests[] = { + make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 8), + make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 8), + make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 8), + make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 8), + make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 8), + make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 8), + make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 8), + make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 8), + make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 8), + make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 8), + make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 8)}; +INSTANTIATE_TEST_CASE_P(C_8, SADavgVP9Test, + ::testing::ValuesIn(avg_c_vp9_highbd_8_tests)); + +SadMxNAvgVp9Param avg_c_vp9_highbd_10_tests[] = { + make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 10), + make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 10), + make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 10), + make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 10), + 
make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 10), + make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 10), + make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 10), + make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 10), + make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 10), + make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 10), + make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 10)}; +INSTANTIATE_TEST_CASE_P(C_10, SADavgVP9Test, + ::testing::ValuesIn(avg_c_vp9_highbd_10_tests)); + +SadMxNAvgVp9Param avg_c_vp9_highbd_12_tests[] = { + make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 12), + make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 12), + make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 12), + make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 12), + make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 12), + make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 12), + make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 12), + make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 12), + make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 12), + make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 12), + make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 12)}; +INSTANTIATE_TEST_CASE_P(C_12, SADavgVP9Test, + ::testing::ValuesIn(avg_c_vp9_highbd_12_tests)); + +const SadMxNx4Func highbd_sad_64x64x4d_c = vp9_highbd_sad64x64x4d_c; +const SadMxNx4Func highbd_sad_64x32x4d_c = vp9_highbd_sad64x32x4d_c; +const SadMxNx4Func highbd_sad_32x64x4d_c = vp9_highbd_sad32x64x4d_c; +const SadMxNx4Func highbd_sad_32x32x4d_c = vp9_highbd_sad32x32x4d_c; +const SadMxNx4Func highbd_sad_32x16x4d_c = vp9_highbd_sad32x16x4d_c; +const SadMxNx4Func highbd_sad_16x32x4d_c = vp9_highbd_sad16x32x4d_c; +const SadMxNx4Func highbd_sad_16x16x4d_c = vp9_highbd_sad16x16x4d_c; +const SadMxNx4Func highbd_sad_16x8x4d_c = vp9_highbd_sad16x8x4d_c; +const SadMxNx4Func highbd_sad_8x16x4d_c = vp9_highbd_sad8x16x4d_c; +const SadMxNx4Func highbd_sad_8x8x4d_c = vp9_highbd_sad8x8x4d_c; +const SadMxNx4Func highbd_sad_8x4x4d_c = vp9_highbd_sad8x4x4d_c; +const SadMxNx4Func highbd_sad_4x8x4d_c = vp9_highbd_sad4x8x4d_c; +const SadMxNx4Func highbd_sad_4x4x4d_c = vp9_highbd_sad4x4x4d_c; +INSTANTIATE_TEST_CASE_P(C_8, SADx4Test, ::testing::Values( + make_tuple(64, 64, highbd_sad_64x64x4d_c, 8), + make_tuple(64, 32, highbd_sad_64x32x4d_c, 8), + make_tuple(32, 64, highbd_sad_32x64x4d_c, 8), + make_tuple(32, 32, highbd_sad_32x32x4d_c, 8), + make_tuple(32, 16, highbd_sad_32x16x4d_c, 8), + make_tuple(16, 32, highbd_sad_16x32x4d_c, 8), + make_tuple(16, 16, highbd_sad_16x16x4d_c, 8), + make_tuple(16, 8, highbd_sad_16x8x4d_c, 8), + make_tuple(8, 16, highbd_sad_8x16x4d_c, 8), + make_tuple(8, 8, highbd_sad_8x8x4d_c, 8), + make_tuple(8, 4, highbd_sad_8x4x4d_c, 8), + make_tuple(4, 8, highbd_sad_4x8x4d_c, 8), + make_tuple(4, 4, highbd_sad_4x4x4d_c, 8))); + +INSTANTIATE_TEST_CASE_P(C_10, SADx4Test, ::testing::Values( + make_tuple(64, 64, highbd_sad_64x64x4d_c, 10), + make_tuple(64, 32, highbd_sad_64x32x4d_c, 10), + make_tuple(32, 64, highbd_sad_32x64x4d_c, 10), + make_tuple(32, 32, highbd_sad_32x32x4d_c, 10), + make_tuple(32, 16, highbd_sad_32x16x4d_c, 10), + make_tuple(16, 32, highbd_sad_16x32x4d_c, 10), + make_tuple(16, 16, highbd_sad_16x16x4d_c, 10), + make_tuple(16, 8, highbd_sad_16x8x4d_c, 10), + make_tuple(8, 16, highbd_sad_8x16x4d_c, 10), + make_tuple(8, 8, highbd_sad_8x8x4d_c, 10), + make_tuple(8, 4, highbd_sad_8x4x4d_c, 10), + make_tuple(4, 8, highbd_sad_4x8x4d_c, 10), + make_tuple(4, 4, highbd_sad_4x4x4d_c, 10))); + +INSTANTIATE_TEST_CASE_P(C_12, SADx4Test, ::testing::Values( + make_tuple(64, 64, highbd_sad_64x64x4d_c, 12), + 
make_tuple(64, 32, highbd_sad_64x32x4d_c, 12), + make_tuple(32, 64, highbd_sad_32x64x4d_c, 12), + make_tuple(32, 32, highbd_sad_32x32x4d_c, 12), + make_tuple(32, 16, highbd_sad_32x16x4d_c, 12), + make_tuple(16, 32, highbd_sad_16x32x4d_c, 12), + make_tuple(16, 16, highbd_sad_16x16x4d_c, 12), + make_tuple(16, 8, highbd_sad_16x8x4d_c, 12), + make_tuple(8, 16, highbd_sad_8x16x4d_c, 12), + make_tuple(8, 8, highbd_sad_8x8x4d_c, 12), + make_tuple(8, 4, highbd_sad_8x4x4d_c, 12), + make_tuple(4, 8, highbd_sad_4x8x4d_c, 12), + make_tuple(4, 4, highbd_sad_4x4x4d_c, 12))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP9_ENCODER //------------------------------------------------------------------------------ @@ -453,7 +880,7 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values( #if CONFIG_VP8_ENCODER const SadMxNFunc sad_16x16_armv6 = vp8_sad16x16_armv6; INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values( - make_tuple(16, 16, sad_16x16_armv6))); + make_tuple(16, 16, sad_16x16_armv6, -1))); #endif // CONFIG_VP8_ENCODER #endif // HAVE_MEDIA @@ -465,11 +892,11 @@ const SadMxNFunc sad_16x8_neon = vp8_sad16x8_neon; const SadMxNFunc sad_8x8_neon = vp8_sad8x8_neon; const SadMxNFunc sad_4x4_neon = vp8_sad4x4_neon; INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values( - make_tuple(16, 16, sad_16x16_neon), - make_tuple(8, 16, sad_8x16_neon), - make_tuple(16, 8, sad_16x8_neon), - make_tuple(8, 8, sad_8x8_neon), - make_tuple(4, 4, sad_4x4_neon))); + make_tuple(16, 16, sad_16x16_neon, -1), + make_tuple(8, 16, sad_8x16_neon, -1), + make_tuple(16, 8, sad_16x8_neon, -1), + make_tuple(8, 8, sad_8x8_neon, -1), + make_tuple(4, 4, sad_4x4_neon, -1))); #endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER const SadMxNVp9Func sad_64x64_neon_vp9 = vp9_sad64x64_neon; @@ -477,10 +904,10 @@ const SadMxNVp9Func sad_32x32_neon_vp9 = vp9_sad32x32_neon; const SadMxNVp9Func sad_16x16_neon_vp9 = vp9_sad16x16_neon; const SadMxNVp9Func sad_8x8_neon_vp9 = vp9_sad8x8_neon; const SadMxNVp9Param neon_vp9_tests[] = { - make_tuple(64, 64, sad_64x64_neon_vp9), - make_tuple(32, 32, sad_32x32_neon_vp9), - make_tuple(16, 16, sad_16x16_neon_vp9), - make_tuple(8, 8, sad_8x8_neon_vp9), + make_tuple(64, 64, sad_64x64_neon_vp9, -1), + make_tuple(32, 32, sad_32x32_neon_vp9, -1), + make_tuple(16, 16, sad_16x16_neon_vp9, -1), + make_tuple(8, 8, sad_8x8_neon_vp9, -1), }; INSTANTIATE_TEST_CASE_P(NEON, SADVP9Test, ::testing::ValuesIn(neon_vp9_tests)); #endif // CONFIG_VP9_ENCODER @@ -496,11 +923,11 @@ const SadMxNFunc sad_16x8_mmx = vp8_sad16x8_mmx; const SadMxNFunc sad_8x8_mmx = vp8_sad8x8_mmx; const SadMxNFunc sad_4x4_mmx = vp8_sad4x4_mmx; const SadMxNParam mmx_tests[] = { - make_tuple(16, 16, sad_16x16_mmx), - make_tuple(8, 16, sad_8x16_mmx), - make_tuple(16, 8, sad_16x8_mmx), - make_tuple(8, 8, sad_8x8_mmx), - make_tuple(4, 4, sad_4x4_mmx), + make_tuple(16, 16, sad_16x16_mmx, -1), + make_tuple(8, 16, sad_8x16_mmx, -1), + make_tuple(16, 8, sad_16x8_mmx, -1), + make_tuple(8, 8, sad_8x8_mmx, -1), + make_tuple(4, 4, sad_4x4_mmx, -1), }; INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests)); #endif // CONFIG_VP8_ENCODER @@ -513,14 +940,14 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests)); const SadMxNVp9Func sad_4x4_sse_vp9 = vp9_sad4x4_sse; const SadMxNVp9Func sad_4x8_sse_vp9 = vp9_sad4x8_sse; INSTANTIATE_TEST_CASE_P(SSE, SADVP9Test, ::testing::Values( - make_tuple(4, 4, sad_4x4_sse_vp9), - make_tuple(4, 8, sad_4x8_sse_vp9))); + make_tuple(4, 4, sad_4x4_sse_vp9, -1), + make_tuple(4, 8, 
sad_4x8_sse_vp9, -1))); const SadMxNx4Func sad_4x8x4d_sse = vp9_sad4x8x4d_sse; const SadMxNx4Func sad_4x4x4d_sse = vp9_sad4x4x4d_sse; INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values( - make_tuple(4, 8, sad_4x8x4d_sse), - make_tuple(4, 4, sad_4x4x4d_sse))); + make_tuple(4, 8, sad_4x8x4d_sse, -1), + make_tuple(4, 4, sad_4x4x4d_sse, -1))); #endif // CONFIG_USE_X86INC #endif // CONFIG_VP9_ENCODER #endif // HAVE_SSE @@ -533,11 +960,11 @@ const SadMxNFunc sad_16x8_wmt = vp8_sad16x8_wmt; const SadMxNFunc sad_8x8_wmt = vp8_sad8x8_wmt; const SadMxNFunc sad_4x4_wmt = vp8_sad4x4_wmt; const SadMxNParam sse2_tests[] = { - make_tuple(16, 16, sad_16x16_wmt), - make_tuple(8, 16, sad_8x16_wmt), - make_tuple(16, 8, sad_16x8_wmt), - make_tuple(8, 8, sad_8x8_wmt), - make_tuple(4, 4, sad_4x4_wmt), + make_tuple(16, 16, sad_16x16_wmt, -1), + make_tuple(8, 16, sad_8x16_wmt, -1), + make_tuple(16, 8, sad_16x8_wmt, -1), + make_tuple(8, 8, sad_8x8_wmt, -1), + make_tuple(4, 4, sad_4x4_wmt, -1), }; INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); #endif // CONFIG_VP8_ENCODER @@ -555,20 +982,6 @@ const SadMxNVp9Func sad_16x8_sse2_vp9 = vp9_sad16x8_sse2; const SadMxNVp9Func sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; const SadMxNVp9Func sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; const SadMxNVp9Func sad_8x4_sse2_vp9 = vp9_sad8x4_sse2; -const SadMxNVp9Param sse2_vp9_tests[] = { - make_tuple(64, 64, sad_64x64_sse2_vp9), - make_tuple(64, 32, sad_64x32_sse2_vp9), - make_tuple(32, 64, sad_32x64_sse2_vp9), - make_tuple(32, 32, sad_32x32_sse2_vp9), - make_tuple(32, 16, sad_32x16_sse2_vp9), - make_tuple(16, 32, sad_16x32_sse2_vp9), - make_tuple(16, 16, sad_16x16_sse2_vp9), - make_tuple(16, 8, sad_16x8_sse2_vp9), - make_tuple(8, 16, sad_8x16_sse2_vp9), - make_tuple(8, 8, sad_8x8_sse2_vp9), - make_tuple(8, 4, sad_8x4_sse2_vp9), -}; -INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::ValuesIn(sse2_vp9_tests)); const SadMxNx4Func sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2; const SadMxNx4Func sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2; @@ -581,18 +994,214 @@ const SadMxNx4Func sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2; const SadMxNx4Func sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2; const SadMxNx4Func sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2; const SadMxNx4Func sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2; + +#if CONFIG_VP9_HIGHBITDEPTH +const SadMxNVp9Func highbd_sad8x4_sse2_vp9 = vp9_highbd_sad8x4_sse2; +const SadMxNVp9Func highbd_sad8x8_sse2_vp9 = vp9_highbd_sad8x8_sse2; +const SadMxNVp9Func highbd_sad8x16_sse2_vp9 = vp9_highbd_sad8x16_sse2; +const SadMxNVp9Func highbd_sad16x8_sse2_vp9 = vp9_highbd_sad16x8_sse2; +const SadMxNVp9Func highbd_sad16x16_sse2_vp9 = vp9_highbd_sad16x16_sse2; +const SadMxNVp9Func highbd_sad16x32_sse2_vp9 = vp9_highbd_sad16x32_sse2; +const SadMxNVp9Func highbd_sad32x16_sse2_vp9 = vp9_highbd_sad32x16_sse2; +const SadMxNVp9Func highbd_sad32x32_sse2_vp9 = vp9_highbd_sad32x32_sse2; +const SadMxNVp9Func highbd_sad32x64_sse2_vp9 = vp9_highbd_sad32x64_sse2; +const SadMxNVp9Func highbd_sad64x32_sse2_vp9 = vp9_highbd_sad64x32_sse2; +const SadMxNVp9Func highbd_sad64x64_sse2_vp9 = vp9_highbd_sad64x64_sse2; + +INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::Values( + make_tuple(64, 64, sad_64x64_sse2_vp9, -1), + make_tuple(64, 32, sad_64x32_sse2_vp9, -1), + make_tuple(32, 64, sad_32x64_sse2_vp9, -1), + make_tuple(32, 32, sad_32x32_sse2_vp9, -1), + make_tuple(32, 16, sad_32x16_sse2_vp9, -1), + make_tuple(16, 32, sad_16x32_sse2_vp9, -1), + make_tuple(16, 16, sad_16x16_sse2_vp9, -1), + make_tuple(16, 8, 
sad_16x8_sse2_vp9, -1), + make_tuple(8, 16, sad_8x16_sse2_vp9, -1), + make_tuple(8, 8, sad_8x8_sse2_vp9, -1), + make_tuple(8, 4, sad_8x4_sse2_vp9, -1), + make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 8), + make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 8), + make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 8), + make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 8), + make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 8), + make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 8), + make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 8), + make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 8), + make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 8), + make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 8), + make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 8), + make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 10), + make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 10), + make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 10), + make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 10), + make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 10), + make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 10), + make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 10), + make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 10), + make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 10), + make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 10), + make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 10), + make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 12), + make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 12), + make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 12), + make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 12), + make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 12), + make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 12), + make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 12), + make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 12), + make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 12), + make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 12), + make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 12))); + +const SadMxNAvgVp9Func highbd_sad8x4_avg_sse2_vp9 = vp9_highbd_sad8x4_avg_sse2; +const SadMxNAvgVp9Func highbd_sad8x8_avg_sse2_vp9 = vp9_highbd_sad8x8_avg_sse2; +const SadMxNAvgVp9Func highbd_sad8x16_avg_sse2_vp9 = + vp9_highbd_sad8x16_avg_sse2; +const SadMxNAvgVp9Func highbd_sad16x8_avg_sse2_vp9 = + vp9_highbd_sad16x8_avg_sse2; +const SadMxNAvgVp9Func highbd_sad16x16_avg_sse2_vp9 = + vp9_highbd_sad16x16_avg_sse2; +const SadMxNAvgVp9Func highbd_sad16x32_avg_sse2_vp9 = + vp9_highbd_sad16x32_avg_sse2; +const SadMxNAvgVp9Func highbd_sad32x16_avg_sse2_vp9 = + vp9_highbd_sad32x16_avg_sse2; +const SadMxNAvgVp9Func highbd_sad32x32_avg_sse2_vp9 = + vp9_highbd_sad32x32_avg_sse2; +const SadMxNAvgVp9Func highbd_sad32x64_avg_sse2_vp9 = + vp9_highbd_sad32x64_avg_sse2; +const SadMxNAvgVp9Func highbd_sad64x32_avg_sse2_vp9 = + vp9_highbd_sad64x32_avg_sse2; +const SadMxNAvgVp9Func highbd_sad64x64_avg_sse2_vp9 = + vp9_highbd_sad64x64_avg_sse2; + +INSTANTIATE_TEST_CASE_P(SSE2, SADavgVP9Test, ::testing::Values( + make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 8), + make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 8), + make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 8), + make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 8), + make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 8), + make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 8), + make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 8), + make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 8), + make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 8), + make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 8), + make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 8), + make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 10), + make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 10), + make_tuple(8, 16, 
highbd_sad8x16_avg_sse2_vp9, 10), + make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 10), + make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 10), + make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 10), + make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 10), + make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 10), + make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 10), + make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 10), + make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 10), + make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 12), + make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 12), + make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 12), + make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 12), + make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 12), + make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 12), + make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 12), + make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 12), + make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 12), + make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 12), + make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 12))); + +const SadMxNx4Func highbd_sad_64x64x4d_sse2 = vp9_highbd_sad64x64x4d_sse2; +const SadMxNx4Func highbd_sad_64x32x4d_sse2 = vp9_highbd_sad64x32x4d_sse2; +const SadMxNx4Func highbd_sad_32x64x4d_sse2 = vp9_highbd_sad32x64x4d_sse2; +const SadMxNx4Func highbd_sad_32x32x4d_sse2 = vp9_highbd_sad32x32x4d_sse2; +const SadMxNx4Func highbd_sad_32x16x4d_sse2 = vp9_highbd_sad32x16x4d_sse2; +const SadMxNx4Func highbd_sad_16x32x4d_sse2 = vp9_highbd_sad16x32x4d_sse2; +const SadMxNx4Func highbd_sad_16x16x4d_sse2 = vp9_highbd_sad16x16x4d_sse2; +const SadMxNx4Func highbd_sad_16x8x4d_sse2 = vp9_highbd_sad16x8x4d_sse2; +const SadMxNx4Func highbd_sad_8x16x4d_sse2 = vp9_highbd_sad8x16x4d_sse2; +const SadMxNx4Func highbd_sad_8x8x4d_sse2 = vp9_highbd_sad8x8x4d_sse2; +const SadMxNx4Func highbd_sad_8x4x4d_sse2 = vp9_highbd_sad8x4x4d_sse2; +const SadMxNx4Func highbd_sad_4x8x4d_sse2 = vp9_highbd_sad4x8x4d_sse2; +const SadMxNx4Func highbd_sad_4x4x4d_sse2 = vp9_highbd_sad4x4x4d_sse2; + +INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( + make_tuple(64, 64, sad_64x64x4d_sse2, -1), + make_tuple(64, 32, sad_64x32x4d_sse2, -1), + make_tuple(32, 64, sad_32x64x4d_sse2, -1), + make_tuple(32, 32, sad_32x32x4d_sse2, -1), + make_tuple(32, 16, sad_32x16x4d_sse2, -1), + make_tuple(16, 32, sad_16x32x4d_sse2, -1), + make_tuple(16, 16, sad_16x16x4d_sse2, -1), + make_tuple(16, 8, sad_16x8x4d_sse2, -1), + make_tuple(8, 16, sad_8x16x4d_sse2, -1), + make_tuple(8, 8, sad_8x8x4d_sse2, -1), + make_tuple(8, 4, sad_8x4x4d_sse2, -1), + make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 8), + make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 8), + make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 8), + make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 8), + make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 8), + make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 8), + make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 8), + make_tuple(16, 8, highbd_sad_16x8x4d_sse2, 8), + make_tuple(8, 16, highbd_sad_8x16x4d_sse2, 8), + make_tuple(8, 8, highbd_sad_8x8x4d_sse2, 8), + make_tuple(8, 4, highbd_sad_8x4x4d_sse2, 8), + make_tuple(4, 8, highbd_sad_4x8x4d_sse2, 8), + make_tuple(4, 4, highbd_sad_4x4x4d_sse2, 8), + make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 10), + make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 10), + make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 10), + make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 10), + make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 10), + make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 
10), + make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 10), + make_tuple(16, 8, highbd_sad_16x8x4d_sse2, 10), + make_tuple(8, 16, highbd_sad_8x16x4d_sse2, 10), + make_tuple(8, 8, highbd_sad_8x8x4d_sse2, 10), + make_tuple(8, 4, highbd_sad_8x4x4d_sse2, 10), + make_tuple(4, 8, highbd_sad_4x8x4d_sse2, 10), + make_tuple(4, 4, highbd_sad_4x4x4d_sse2, 10), + make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 12), + make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 12), + make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 12), + make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 12), + make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 12), + make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 12), + make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 12), + make_tuple(16, 8, highbd_sad_16x8x4d_sse2, 12), + make_tuple(8, 16, highbd_sad_8x16x4d_sse2, 12), + make_tuple(8, 8, highbd_sad_8x8x4d_sse2, 12), + make_tuple(8, 4, highbd_sad_8x4x4d_sse2, 12), + make_tuple(4, 8, highbd_sad_4x8x4d_sse2, 12), + make_tuple(4, 4, highbd_sad_4x4x4d_sse2, 12))); +#else +INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::Values( + make_tuple(64, 64, sad_64x64_sse2_vp9, -1), + make_tuple(64, 32, sad_64x32_sse2_vp9, -1), + make_tuple(32, 64, sad_32x64_sse2_vp9, -1), + make_tuple(32, 32, sad_32x32_sse2_vp9, -1), + make_tuple(32, 16, sad_32x16_sse2_vp9, -1), + make_tuple(16, 32, sad_16x32_sse2_vp9, -1), + make_tuple(16, 16, sad_16x16_sse2_vp9, -1), + make_tuple(16, 8, sad_16x8_sse2_vp9, -1), + make_tuple(8, 16, sad_8x16_sse2_vp9, -1), + make_tuple(8, 8, sad_8x8_sse2_vp9, -1), + make_tuple(8, 4, sad_8x4_sse2_vp9, -1))); + INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( - make_tuple(64, 64, sad_64x64x4d_sse2), - make_tuple(64, 32, sad_64x32x4d_sse2), - make_tuple(32, 64, sad_32x64x4d_sse2), - make_tuple(32, 32, sad_32x32x4d_sse2), - make_tuple(32, 16, sad_32x16x4d_sse2), - make_tuple(16, 32, sad_16x32x4d_sse2), - make_tuple(16, 16, sad_16x16x4d_sse2), - make_tuple(16, 8, sad_16x8x4d_sse2), - make_tuple(8, 16, sad_8x16x4d_sse2), - make_tuple(8, 8, sad_8x8x4d_sse2), - make_tuple(8, 4, sad_8x4x4d_sse2))); + make_tuple(64, 64, sad_64x64x4d_sse2, -1), + make_tuple(64, 32, sad_64x32x4d_sse2, -1), + make_tuple(32, 64, sad_32x64x4d_sse2, -1), + make_tuple(32, 32, sad_32x32x4d_sse2, -1), + make_tuple(32, 16, sad_32x16x4d_sse2, -1), + make_tuple(16, 32, sad_16x32x4d_sse2, -1), + make_tuple(16, 16, sad_16x16x4d_sse2, -1), + make_tuple(16, 8, sad_16x8x4d_sse2, -1), + make_tuple(8, 16, sad_8x16x4d_sse2, -1), + make_tuple(8, 8, sad_8x8x4d_sse2, -1), + make_tuple(8, 4, sad_8x4x4d_sse2, -1))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_USE_X86INC #endif // CONFIG_VP9_ENCODER #endif // HAVE_SSE2 @@ -605,11 +1214,11 @@ const SadMxNx4Func sad_8x16x4d_sse3 = vp8_sad8x16x4d_sse3; const SadMxNx4Func sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3; const SadMxNx4Func sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3; INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values( - make_tuple(16, 16, sad_16x16x4d_sse3), - make_tuple(16, 8, sad_16x8x4d_sse3), - make_tuple(8, 16, sad_8x16x4d_sse3), - make_tuple(8, 8, sad_8x8x4d_sse3), - make_tuple(4, 4, sad_4x4x4d_sse3))); + make_tuple(16, 16, sad_16x16x4d_sse3, -1), + make_tuple(16, 8, sad_16x8x4d_sse3, -1), + make_tuple(8, 16, sad_8x16x4d_sse3, -1), + make_tuple(8, 8, sad_8x8x4d_sse3, -1), + make_tuple(4, 4, sad_4x4x4d_sse3, -1))); #endif // CONFIG_VP8_ENCODER #endif // HAVE_SSE3 @@ -618,32 +1227,18 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values( #if CONFIG_VP8_ENCODER const SadMxNFunc sad_16x16_sse3 = vp8_sad16x16_sse3; 
INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( - make_tuple(16, 16, sad_16x16_sse3))); + make_tuple(16, 16, sad_16x16_sse3, -1))); #endif // CONFIG_VP8_ENCODER #endif // CONFIG_USE_X86INC #endif // HAVE_SSSE3 #if HAVE_AVX2 #if CONFIG_VP9_ENCODER -const SadMxNVp9Func sad_64x64_avx2_vp9 = vp9_sad64x64_avx2; -const SadMxNVp9Func sad_64x32_avx2_vp9 = vp9_sad64x32_avx2; -const SadMxNVp9Func sad_32x64_avx2_vp9 = vp9_sad32x64_avx2; -const SadMxNVp9Func sad_32x32_avx2_vp9 = vp9_sad32x32_avx2; -const SadMxNVp9Func sad_32x16_avx2_vp9 = vp9_sad32x16_avx2; -const SadMxNVp9Param avx2_vp9_tests[] = { - make_tuple(64, 64, sad_64x64_avx2_vp9), - make_tuple(64, 32, sad_64x32_avx2_vp9), - make_tuple(32, 64, sad_32x64_avx2_vp9), - make_tuple(32, 32, sad_32x32_avx2_vp9), - make_tuple(32, 16, sad_32x16_avx2_vp9), -}; -INSTANTIATE_TEST_CASE_P(AVX2, SADVP9Test, ::testing::ValuesIn(avx2_vp9_tests)); - const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2; const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2; INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values( - make_tuple(32, 32, sad_32x32x4d_avx2), - make_tuple(64, 64, sad_64x64x4d_avx2))); + make_tuple(32, 32, sad_32x32x4d_avx2, -1), + make_tuple(64, 64, sad_64x64x4d_avx2, -1))); #endif // CONFIG_VP9_ENCODER #endif // HAVE_AVX2 diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 0530f3a30..f3922b1ea 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1652,37 +1652,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/; add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad64x64/; + specialize qw/vp9_highbd_sad64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad32x64/; + specialize qw/vp9_highbd_sad32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad64x32/; + specialize qw/vp9_highbd_sad64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad32x16/; + specialize qw/vp9_highbd_sad32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad16x32/; + specialize qw/vp9_highbd_sad16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad32x32/; + specialize qw/vp9_highbd_sad32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad16x16/; + specialize qw/vp9_highbd_sad16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad16x8/; + specialize qw/vp9_highbd_sad16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - 
specialize qw/vp9_highbd_sad8x16/; + specialize qw/vp9_highbd_sad8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad8x8/; + specialize qw/vp9_highbd_sad8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad8x4/; + specialize qw/vp9_highbd_sad8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_highbd_sad4x8/; @@ -1691,37 +1691,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_sad4x4/; add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad64x64_avg/; + specialize qw/vp9_highbd_sad64x64_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad32x64_avg/; + specialize qw/vp9_highbd_sad32x64_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad64x32_avg/; + specialize qw/vp9_highbd_sad64x32_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad32x16_avg/; + specialize qw/vp9_highbd_sad32x16_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad16x32_avg/; + specialize qw/vp9_highbd_sad16x32_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad32x32_avg/; + specialize qw/vp9_highbd_sad32x32_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad16x16_avg/; + specialize qw/vp9_highbd_sad16x16_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad16x8_avg/; + specialize qw/vp9_highbd_sad16x8_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad8x16_avg/; + specialize qw/vp9_highbd_sad8x16_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad8x8_avg/; + specialize qw/vp9_highbd_sad8x8_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t 
*ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad8x4_avg/; + specialize qw/vp9_highbd_sad8x4_avg/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vp9_highbd_sad4x8_avg/; @@ -1778,44 +1778,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_sad4x4x8/; add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad64x64x4d/; + specialize qw/vp9_highbd_sad64x64x4d sse2/; add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad32x64x4d/; + specialize qw/vp9_highbd_sad32x64x4d sse2/; add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad64x32x4d/; + specialize qw/vp9_highbd_sad64x32x4d sse2/; add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad32x16x4d/; + specialize qw/vp9_highbd_sad32x16x4d sse2/; add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad16x32x4d/; + specialize qw/vp9_highbd_sad16x32x4d sse2/; add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad32x32x4d/; + specialize qw/vp9_highbd_sad32x32x4d sse2/; add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad16x16x4d/; + specialize qw/vp9_highbd_sad16x16x4d sse2/; add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad16x8x4d/; + specialize qw/vp9_highbd_sad16x8x4d sse2/; add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad8x16x4d/; + specialize qw/vp9_highbd_sad8x16x4d sse2/; add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad8x8x4d/; + specialize qw/vp9_highbd_sad8x8x4d sse2/; - # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad8x4x4d/; + specialize qw/vp9_highbd_sad8x4x4d sse2/; add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad4x8x4d/; + specialize qw/vp9_highbd_sad4x8x4d sse2/; add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* 
const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad4x4x4d/; + specialize qw/vp9_highbd_sad4x4x4d sse2/; add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_highbd_mse16x16/; diff --git a/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm b/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm new file mode 100644 index 000000000..986efb11f --- /dev/null +++ b/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm @@ -0,0 +1,284 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_4x2x4 5-6 0 + movh m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + movhps m0, [srcq +%4*2] + movhps m4, [ref1q+%5*2] + movhps m5, [ref2q+%5*2] + movhps m6, [ref3q+%5*2] + movhps m7, [ref4q+%5*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + movu m2, [ref1q+%3*2] + movhps m0, [srcq +%4*2] + movhps m2, [ref1q+%5*2] + mova m3, m0 + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m4, m2 + + movu m2, [ref2q+%3*2] + mova m3, m0 + movhps m2, [ref2q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m5, m2 + + movu m2, [ref3q+%3*2] + mova m3, m0 + movhps m2, [ref3q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m6, m2 + + movu m2, [ref4q+%3*2] + mova m3, m0 + movhps m2, [ref4q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_8x2x4 5-6 0 + ; 1st 8 px + mova m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + mova m3, m0 + movu m2, [ref1q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+  ; 1st 8 px
+  mova        m0, [srcq +%2*2]
+%if %1 == 1
+  movu        m4, [ref1q+%3*2]
+  movu        m5, [ref2q+%3*2]
+  movu        m6, [ref3q+%3*2]
+  movu        m7, [ref4q+%3*2]
+  mova        m3, m0
+  mova        m2, m0
+  psubusw     m3, m4
+  psubusw     m2, m5
+  psubusw     m4, m0
+  psubusw     m5, m0
+  por         m4, m3
+  por         m5, m2
+  pmaddwd     m4, m1
+  pmaddwd     m5, m1
+  mova        m3, m0
+  mova        m2, m0
+  psubusw     m3, m6
+  psubusw     m2, m7
+  psubusw     m6, m0
+  psubusw     m7, m0
+  por         m6, m3
+  por         m7, m2
+  pmaddwd     m6, m1
+  pmaddwd     m7, m1
+%else
+  mova        m3, m0
+  movu        m2, [ref1q+%3*2]
+  psubusw     m3, m2
+  psubusw     m2, m0
+  por         m2, m3
+  mova        m3, m0
+  pmaddwd     m2, m1
+  paddd       m4, m2
+  movu        m2, [ref2q+%3*2]
+  psubusw     m3, m2
+  psubusw     m2, m0
+  por         m2, m3
+  mova        m3, m0
+  pmaddwd     m2, m1
+  paddd       m5, m2
+  movu        m2, [ref3q+%3*2]
+  psubusw     m3, m2
+  psubusw     m2, m0
+  por         m2, m3
+  mova        m3, m0
+  pmaddwd     m2, m1
+  paddd       m6, m2
+  movu        m2, [ref4q+%3*2]
+  psubusw     m3, m2
+  psubusw     m2, m0
+  por         m2, m3
+  pmaddwd     m2, m1
+  paddd       m7, m2
+%endif
+
+  ; 2nd 8 px
+  mova        m0, [srcq +(%4)*2]
+  mova        m3, m0
+  movu        m2, [ref1q+(%5)*2]
+  psubusw     m3, m2
+  psubusw     m2, m0
+  por         m2, m3
+  mova        m3, m0
+  pmaddwd     m2, m1
+  paddd       m4, m2
+  movu        m2, [ref2q+(%5)*2]
+  psubusw     m3, m2
+  psubusw     m2, m0
+  por         m2, m3
+  mova        m3, m0
+  pmaddwd     m2, m1
+  paddd       m5, m2
+  movu        m2, [ref3q+(%5)*2]
+  psubusw     m3, m2
+  psubusw     m2, m0
+  por         m2, m3
+  mova        m3, m0
+  pmaddwd     m2, m1
+  paddd       m6, m2
+  movu        m2, [ref4q+(%5)*2]
+  psubusw     m3, m2
+  psubusw     m2, m0
+%if %6 == 1
+  lea       srcq, [srcq +src_strideq*4]
+  lea      ref1q, [ref1q+ref_strideq*4]
+  lea      ref2q, [ref2q+ref_strideq*4]
+  lea      ref3q, [ref3q+ref_strideq*4]
+  lea      ref4q, [ref4q+ref_strideq*4]
+%endif
+  por         m2, m3
+  pmaddwd     m2, m1
+  paddd       m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
+  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
+  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
+  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
+%endmacro
+
+; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+;                                uint8_t *ref[4], int ref_stride,
+;                                unsigned int res[4]);
+; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
+;             8x16, 8x8, 8x4, 4x8 or 4x4
+%macro HIGH_SADNXN4D 2
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, \
+                                     res, ref2, ref3, ref4, one
+%else
+cglobal highbd_sad%1x%2x4d, 4, 8, 8, src, src_stride, ref1, ref_stride, \
+                                     ref2, ref3, ref4, one
+%endif
+
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  mov      ref2q, [ref1q+gprsize*1]
+  mov      ref3q, [ref1q+gprsize*2]
+  mov      ref4q, [ref1q+gprsize*3]
+  mov      ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+  shl       srcq, 1
+  shl      ref2q, 1
+  shl      ref3q, 1
+  shl      ref4q, 1
+  shl      ref1q, 1
+
+  mov       oned, 0x00010001
+  movd        m1, oned
+  pshufd      m1, m1, 0x0
+
+  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+  ; N.B. HIGH_PROCESS outputs dwords (32 bits), so in high bit depth even
+  ; the smallest width (4) needs 128 bits, i.e. an XMM register.
+  movhlps     m0, m4
+  movhlps     m1, m5
+  movhlps     m2, m6
+  movhlps     m3, m7
+  paddd       m4, m0
+  paddd       m5, m1
+  paddd       m6, m2
+  paddd       m7, m3
+  punpckldq   m4, m5
+  punpckldq   m6, m7
+  movhlps     m0, m4
+  movhlps     m1, m6
+  paddd       m4, m0
+  paddd       m6, m1
+  punpcklqdq  m4, m6
+  movifnidn   r4, r4mp
+  movu      [r4], m4
+  RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16,  8
+HIGH_SADNXN4D  8, 16
+HIGH_SADNXN4D  8,  8
+HIGH_SADNXN4D  8,  4
+HIGH_SADNXN4D  4,  8
+HIGH_SADNXN4D  4,  4
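
For orientation, the x4d entry points above score four candidate reference
blocks against one source block in a single pass. A minimal C model of the
intended behavior (illustrative only, not part of the patch; pointers are
shown directly as uint16_t* rather than the CONVERT_TO_BYTEPTR byte
pointers the asm receives and shifts left by 1):

    #include <stdint.h>
    #include <stdlib.h>  /* abs */

    /* Hypothetical reference model of vp9_highbd_sadNxNx4d: strides are
     * in samples; one SAD is accumulated per candidate reference. */
    static void highbd_sad_x4d_model(const uint16_t *src, int src_stride,
                                     const uint16_t *const ref[4],
                                     int ref_stride, int width, int height,
                                     unsigned int res[4]) {
      for (int i = 0; i < 4; ++i) {
        unsigned int sad = 0;
        for (int h = 0; h < height; ++h)
          for (int w = 0; w < width; ++w)
            sad += abs(src[h * src_stride + w] - ref[i][h * ref_stride + w]);
        res[i] = sad;
      }
    }
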
diff --git a/vp9/encoder/x86/vp9_highbd_sad_sse2.asm b/vp9/encoder/x86/vp9_highbd_sad_sse2.asm
new file mode 100644
index 000000000..c895ac0ee
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_sad_sse2.asm
@@ -0,0 +1,363 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro HIGH_SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+                                   src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+                                           second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+                                                     ref, ref_stride, \
+                                                     second_pred, \
+                                                     src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+  lea  src_stride3q, [src_strideq*3]
+  lea  ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+  shl          srcq, 1
+  shl          refq, 1
+%if %4 == 1
+  shl  second_predq, 1
+%endif
+%endmacro
+
+; unsigned int vp9_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
+;                                            uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0
+  HIGH_SAD_FN 64, %1, 5, %2
+  mov    n_rowsd, %1
+  pxor        m0, m0
+  pxor        m6, m6
+
+.loop:
+  ; first half of each row
+  movu        m1, [refq]
+  movu        m2, [refq+16]
+  movu        m3, [refq+32]
+  movu        m4, [refq+48]
+%if %2 == 1
+  pavgw       m1, [second_predq+mmsize*0]
+  pavgw       m2, [second_predq+mmsize*1]
+  pavgw       m3, [second_predq+mmsize*2]
+  pavgw       m4, [second_predq+mmsize*3]
+  lea second_predq, [second_predq+mmsize*4]
+%endif
+  mova        m5, [srcq]
+  psubusw     m5, m1
+  psubusw     m1, [srcq]
+  por         m1, m5
+  mova        m5, [srcq+16]
+  psubusw     m5, m2
+  psubusw     m2, [srcq+16]
+  por         m2, m5
+  mova        m5, [srcq+32]
+  psubusw     m5, m3
+  psubusw     m3, [srcq+32]
+  por         m3, m5
+  mova        m5, [srcq+48]
+  psubusw     m5, m4
+  psubusw     m4, [srcq+48]
+  por         m4, m5
+  paddw       m1, m2
+  paddw       m3, m4
+  movhlps     m2, m1
+  movhlps     m4, m3
+  paddw       m1, m2
+  paddw       m3, m4
+  punpcklwd   m1, m6
+  punpcklwd   m3, m6
+  paddd       m0, m1
+  paddd       m0, m3
+  ; second half of each row
+  movu        m1, [refq+64]
+  movu        m2, [refq+80]
+  movu        m3, [refq+96]
+  movu        m4, [refq+112]
+%if %2 == 1
+  pavgw       m1, [second_predq+mmsize*0]
+  pavgw       m2, [second_predq+mmsize*1]
+  pavgw       m3, [second_predq+mmsize*2]
+  pavgw       m4, [second_predq+mmsize*3]
+  lea second_predq, [second_predq+mmsize*4]
+%endif
+  mova        m5, [srcq+64]
+  psubusw     m5, m1
+  psubusw     m1, [srcq+64]
+  por         m1, m5
+  mova        m5, [srcq+80]
+  psubusw     m5, m2
+  psubusw     m2, [srcq+80]
+  por         m2, m5
+  mova        m5, [srcq+96]
+  psubusw     m5, m3
+  psubusw     m3, [srcq+96]
+  por         m3, m5
+  mova        m5, [srcq+112]
+  psubusw     m5, m4
+  psubusw     m4, [srcq+112]
+  por         m4, m5
+  paddw       m1, m2
+  paddw       m3, m4
+  movhlps     m2, m1
+  movhlps     m4, m3
+  paddw       m1, m2
+  paddw       m3, m4
+  punpcklwd   m1, m6
+  punpcklwd   m3, m6
+  lea       refq, [refq+ref_strideq*2]
+  paddd       m0, m1
+  lea       srcq, [srcq+src_strideq*2]
+  paddd       m0, m3
+
+  dec    n_rowsd
+  jg .loop
+
+  movhlps     m1, m0
+  paddd       m0, m1
+  punpckldq   m0, m6
+  movhlps     m1, m0
+  paddd       m0, m1
+  movd       eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
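+
+; Note on the _avg variants instantiated here and below: pavgw folds in
+; the compound predictor with a rounded average, (ref + pred + 1) >> 1,
+; on eight 16-bit samples at a time; the SAD is then taken between the
+; source and that averaged block, with second_predq walking forward
+; through the contiguous width x height prediction buffer.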
+
+
+; unsigned int vp9_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+;                                               uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+  HIGH_SAD_FN 32, %1, 5, %2
+  mov    n_rowsd, %1
+  pxor        m0, m0
+  pxor        m6, m6
+
+.loop:
+  movu        m1, [refq]
+  movu        m2, [refq+16]
+  movu        m3, [refq+32]
+  movu        m4, [refq+48]
+%if %2 == 1
+  pavgw       m1, [second_predq+mmsize*0]
+  pavgw       m2, [second_predq+mmsize*1]
+  pavgw       m3, [second_predq+mmsize*2]
+  pavgw       m4, [second_predq+mmsize*3]
+  lea second_predq, [second_predq+mmsize*4]
+%endif
+  mova        m5, [srcq]
+  psubusw     m5, m1
+  psubusw     m1, [srcq]
+  por         m1, m5
+  mova        m5, [srcq+16]
+  psubusw     m5, m2
+  psubusw     m2, [srcq+16]
+  por         m2, m5
+  mova        m5, [srcq+32]
+  psubusw     m5, m3
+  psubusw     m3, [srcq+32]
+  por         m3, m5
+  mova        m5, [srcq+48]
+  psubusw     m5, m4
+  psubusw     m4, [srcq+48]
+  por         m4, m5
+  paddw       m1, m2
+  paddw       m3, m4
+  movhlps     m2, m1
+  movhlps     m4, m3
+  paddw       m1, m2
+  paddw       m3, m4
+  punpcklwd   m1, m6
+  punpcklwd   m3, m6
+  lea       refq, [refq+ref_strideq*2]
+  paddd       m0, m1
+  lea       srcq, [srcq+src_strideq*2]
+  paddd       m0, m3
+  dec    n_rowsd
+  jg .loop
+
+  movhlps     m1, m0
+  paddd       m0, m1
+  punpckldq   m0, m6
+  movhlps     m1, m0
+  paddd       m0, m1
+  movd       eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+
+; unsigned int vp9_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+;                                              uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+  HIGH_SAD_FN 16, %1, 5, %2
+  mov    n_rowsd, %1/2
+  pxor        m0, m0
+  pxor        m6, m6
+
+.loop:
+  movu        m1, [refq]
+  movu        m2, [refq+16]
+  movu        m3, [refq+ref_strideq*2]
+  movu        m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+  pavgw       m1, [second_predq+mmsize*0]
+  pavgw       m2, [second_predq+mmsize*1]
+  pavgw       m3, [second_predq+mmsize*2]
+  pavgw       m4, [second_predq+mmsize*3]
+  lea second_predq, [second_predq+mmsize*4]
+%endif
+  mova        m5, [srcq]
+  psubusw     m5, m1
+  psubusw     m1, [srcq]
+  por         m1, m5
+  mova        m5, [srcq+16]
+  psubusw     m5, m2
+  psubusw     m2, [srcq+16]
+  por         m2, m5
+  mova        m5, [srcq+src_strideq*2]
+  psubusw     m5, m3
+  psubusw     m3, [srcq+src_strideq*2]
+  por         m3, m5
+  mova        m5, [srcq+src_strideq*2+16]
+  psubusw     m5, m4
+  psubusw     m4, [srcq+src_strideq*2+16]
+  por         m4, m5
+  paddw       m1, m2
+  paddw       m3, m4
+  movhlps     m2, m1
+  movhlps     m4, m3
+  paddw       m1, m2
+  paddw       m3, m4
+  punpcklwd   m1, m6
+  punpcklwd   m3, m6
+  lea       refq, [refq+ref_strideq*4]
+  paddd       m0, m1
+  lea       srcq, [srcq+src_strideq*4]
+  paddd       m0, m3
+  dec    n_rowsd
+  jg .loop
+
+  movhlps     m1, m0
+  paddd       m0, m1
+  punpckldq   m0, m6
+  movhlps     m1, m0
+  paddd       m0, m1
+  movd       eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+
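+
+; Note: an 8-pixel row of 16-bit samples fills one XMM register exactly,
+; so the 8xN kernels below consume four rows per iteration; the extra
+; src_stride3/ref_stride3 registers requested via HIGH_SAD_FN's
+; 7-register form let rows 0..3 be addressed without intermediate
+; pointer updates.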
+
+; unsigned int vp9_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+;                                            uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
+  HIGH_SAD_FN 8, %1, 7, %2
+  mov    n_rowsd, %1/4
+  pxor        m0, m0
+  pxor        m6, m6
+
+.loop:
+  movu        m1, [refq]
+  movu        m2, [refq+ref_strideq*2]
+  movu        m3, [refq+ref_strideq*4]
+  movu        m4, [refq+ref_stride3q*2]
+%if %2 == 1
+  pavgw       m1, [second_predq+mmsize*0]
+  pavgw       m2, [second_predq+mmsize*1]
+  pavgw       m3, [second_predq+mmsize*2]
+  pavgw       m4, [second_predq+mmsize*3]
+  lea second_predq, [second_predq+mmsize*4]
+%endif
+  mova        m5, [srcq]
+  psubusw     m5, m1
+  psubusw     m1, [srcq]
+  por         m1, m5
+  mova        m5, [srcq+src_strideq*2]
+  psubusw     m5, m2
+  psubusw     m2, [srcq+src_strideq*2]
+  por         m2, m5
+  mova        m5, [srcq+src_strideq*4]
+  psubusw     m5, m3
+  psubusw     m3, [srcq+src_strideq*4]
+  por         m3, m5
+  mova        m5, [srcq+src_stride3q*2]
+  psubusw     m5, m4
+  psubusw     m4, [srcq+src_stride3q*2]
+  por         m4, m5
+  paddw       m1, m2
+  paddw       m3, m4
+  movhlps     m2, m1
+  movhlps     m4, m3
+  paddw       m1, m2
+  paddw       m3, m4
+  punpcklwd   m1, m6
+  punpcklwd   m3, m6
+  lea       refq, [refq+ref_strideq*8]
+  paddd       m0, m1
+  lea       srcq, [srcq+src_strideq*8]
+  paddd       m0, m3
+  dec    n_rowsd
+  jg .loop
+
+  movhlps     m1, m0
+  paddd       m0, m1
+  punpckldq   m0, m6
+  movhlps     m1, m0
+  paddd       m0, m1
+  movd       eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index e72cb0024..b3a37745b 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -102,6 +102,9 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
+endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
@@ -110,6 +113,9 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
+endif
 endif
 
 ifeq ($(ARCH_X86_64),yes)
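
For the single-block kernels, the behavior being matched can be sketched in
C as follows (illustrative code, not part of the patch; the name
highbd_sad_avg_model and the assumption that second_pred is contiguous with
stride equal to the block width mirror what the asm does with pavgw):

    #include <stdint.h>
    #include <stdlib.h>  /* abs */

    /* Hypothetical model of the _avg kernels: average the reference with
     * the compound predictor using pavgw rounding, (a + b + 1) >> 1, then
     * accumulate absolute differences against the source. */
    static unsigned int highbd_sad_avg_model(const uint16_t *src,
                                             int src_stride,
                                             const uint16_t *ref,
                                             int ref_stride,
                                             const uint16_t *second_pred,
                                             int width, int height) {
      unsigned int sad = 0;
      for (int h = 0; h < height; ++h) {
        for (int w = 0; w < width; ++w) {
          const int avg =
              (ref[h * ref_stride + w] + second_pred[h * width + w] + 1) >> 1;
          sad += abs(src[h * src_stride + w] - avg);
        }
      }
      return sad;
    }

Dropping second_pred (and the averaging step) from this model gives the
plain vp9_highbd_sadNxN semantics that the non-_avg instantiations cover.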