From: Johann
Date: Fri, 17 Apr 2015 20:11:38 +0000 (-0400)
Subject: Move shared SAD code to vpx_dsp
X-Git-Tag: v1.5.0~705^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d5d92898001064c74548a7fa04b0f624de4afb40;p=libvpx

Move shared SAD code to vpx_dsp

Create a new component, vpx_dsp, for code that can be shared
between codecs. Move the SAD code into the component.

This reduces the size of vpxenc/dec by 36k on x86_64 builds.

Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a
---

diff --git a/libs.mk b/libs.mk
index 3046e1b96..6eee0039c 100644
--- a/libs.mk
+++ b/libs.mk
@@ -54,6 +54,9 @@ CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
 include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk
 CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))
 
+include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk
+CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
+
 ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
 VP8_PREFIX=vp8/
 include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 65e9561a9..6c28edb51 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -14,14 +14,25 @@
 #include
 #include "./vpx_config.h"
-#if CONFIG_VP8_ENCODER
-#include "./vp8_rtcd.h"
-#endif
-#if CONFIG_VP9_ENCODER
-#include "./vp9_rtcd.h"
-#endif
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 
+/* Needed for ROUND_POWER_OF_TWO and CONVERT_TO* macros, both of which should be
+ * moved to a more generic location. Alternatively the *avg functions could be
+ * restricted to VP9 builds, but it would be better to avoid that sort of
+ * specificity.
+ * TODO(johannkoenig): move these macros to a common location.
+ */
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#ifndef ROUND_POWER_OF_TWO
+#define ROUND_POWER_OF_TWO(value, n) \
+    (((value) + (1 << ((n) - 1))) >> (n))
+#endif  // ROUND_POWER_OF_TWO
+
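For a quick sanity check of what this macro computes: ROUND_POWER_OF_TWO(value, n) is (value + 2^(n-1)) >> n, i.e. division by 2^n rounded to nearest. The n == 1 case is the rounded average that the averaging-SAD reference below relies on. A small standalone illustration (not part of the patch):

    // ROUND_POWER_OF_TWO(value, 1) == (value + 1) >> 1: rounded halving.
    // With tmp = pred + ref this is the rounded average of two pixels,
    // matching the (tmp + 1) >> 1 expression in the old VP8-only path.
    assert(ROUND_POWER_OF_TWO(5, 1) == 3);   // (5 + 1) >> 1
    assert(ROUND_POWER_OF_TWO(4, 1) == 2);   // (4 + 1) >> 1
    assert(ROUND_POWER_OF_TWO(10, 2) == 3);  // (10 + 2) >> 2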
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -30,27 +41,18 @@
 #include "vpx/vpx_codec.h"
 
-#if CONFIG_VP8_ENCODER
-typedef unsigned int (*SadMxNFunc)(const unsigned char *source_ptr,
-                                   int source_stride,
-                                   const unsigned char *reference_ptr,
-                                   int reference_stride,
-                                   unsigned int max_sad);
+typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr,
+                                   int src_stride,
+                                   const uint8_t *ref_ptr,
+                                   int ref_stride);
 typedef std::tr1::tuple<int, int, SadMxNFunc, int> SadMxNParam;
-#endif
-#if CONFIG_VP9_ENCODER
-typedef unsigned int (*SadMxNVp9Func)(const unsigned char *source_ptr,
-                                      int source_stride,
-                                      const unsigned char *reference_ptr,
-                                      int reference_stride);
-typedef std::tr1::tuple<int, int, SadMxNVp9Func, int> SadMxNVp9Param;
-typedef uint32_t (*SadMxNAvgVp9Func)(const uint8_t *source_ptr,
-                                     int source_stride,
-                                     const uint8_t *reference_ptr,
-                                     int reference_stride,
-                                     const uint8_t *second_pred);
-typedef std::tr1::tuple<int, int, SadMxNAvgVp9Func, int> SadMxNAvgVp9Param;
-#endif
+
+typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr,
+                                  int src_stride,
+                                  const uint8_t *ref_ptr,
+                                  int ref_stride,
+                                  const uint8_t *second_pred);
+typedef std::tr1::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
 
 typedef void (*SadMxNx4Func)(const uint8_t *src_ptr,
                              int src_stride,
@@ -68,7 +70,6 @@ class SADTestBase : public ::testing::Test {
       width_(width), height_(height), bd_(bit_depth) {}
 
   static void SetUpTestCase() {
-#if CONFIG_VP9_HIGHBITDEPTH
    source_data8_ = reinterpret_cast<uint8_t*>(
        vpx_memalign(kDataAlignment, kDataBlockSize));
    reference_data8_ = reinterpret_cast<uint8_t*>(
        vpx_memalign(kDataAlignment, kDataBufferSize));
@@ -81,18 +82,9 @@ class SADTestBase : public ::testing::Test {
        vpx_memalign(kDataAlignment, kDataBufferSize*sizeof(uint16_t)));
    second_pred16_ = reinterpret_cast<uint16_t*>(
        vpx_memalign(kDataAlignment, 64*64*sizeof(uint16_t)));
-#else
-    source_data_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kDataBlockSize));
-    reference_data_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kDataBufferSize));
-    second_pred_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, 64*64));
-#endif
   }
 
   static void TearDownTestCase() {
-#if CONFIG_VP9_HIGHBITDEPTH
     vpx_free(source_data8_);
     source_data8_ = NULL;
     vpx_free(reference_data8_);
@@ -105,14 +97,6 @@ class SADTestBase : public ::testing::Test {
     reference_data16_ = NULL;
     vpx_free(second_pred16_);
     second_pred16_ = NULL;
-#else
-    vpx_free(source_data_);
-    source_data_ = NULL;
-    vpx_free(reference_data_);
-    reference_data_ = NULL;
-    vpx_free(second_pred_);
-    second_pred_ = NULL;
-#endif
   }
 
   virtual void TearDown() {
@@ -126,23 +110,21 @@ class SADTestBase : public ::testing::Test {
   static const int kDataBufferSize = 4 * kDataBlockSize;
 
   virtual void SetUp() {
-#if CONFIG_VP9_HIGHBITDEPTH
     if (bd_ == -1) {
       use_high_bit_depth_ = false;
       bit_depth_ = VPX_BITS_8;
       source_data_ = source_data8_;
       reference_data_ = reference_data8_;
       second_pred_ = second_pred8_;
+#if CONFIG_VP9_HIGHBITDEPTH
     } else {
       use_high_bit_depth_ = true;
      bit_depth_ = static_cast<vpx_bit_depth_t>(bd_);
       source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
       reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
       second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    bit_depth_ = VPX_BITS_8;
-#endif
     mask_ = (1 << bit_depth_) - 1;
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
@@ -151,51 +133,35 @@
   virtual uint8_t *GetReference(int block_idx) {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (!use_high_bit_depth_) {
-      return reference_data_ + block_idx * kDataBlockSize;
-    } else {
+    if (use_high_bit_depth_)
       return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
                                 block_idx * kDataBlockSize);
-    }
-#else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     return reference_data_ + block_idx * kDataBlockSize;
-#endif
   }
 
   // Sum of Absolute Differences. Given two blocks, calculate the absolute
   // difference between two pixels in the same relative location; accumulate.
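Stripped of the bit-depth plumbing, the quantity being checked is just the sum over the block of |source - reference|, with each buffer addressed through its own stride. A minimal standalone sketch of the 8-bit case (illustrative names, not part of the test fixture):

    // #include <stdint.h>
    // #include <stdlib.h>  /* abs() */
    static unsigned int SimpleSad(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  int width, int height) {
      unsigned int sad = 0;
      for (int h = 0; h < height; ++h) {
        for (int w = 0; w < width; ++w) {
          // Absolute difference of co-located pixels, accumulated.
          sad += abs(src[h * src_stride + w] - ref[h * ref_stride + w]);
        }
      }
      return sad;
    }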
-  unsigned int ReferenceSAD(unsigned int max_sad, int block_idx) {
+  unsigned int ReferenceSAD(int block_idx) {
     unsigned int sad = 0;
-#if CONFIG_VP9_HIGHBITDEPTH
     const uint8_t *const reference8 = GetReference(block_idx);
     const uint8_t *const source8 = source_data_;
+#if CONFIG_VP9_HIGHBITDEPTH
     const uint16_t *const reference16 =
         CONVERT_TO_SHORTPTR(GetReference(block_idx));
     const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
-#else
-    const uint8_t *const reference = GetReference(block_idx);
-    const uint8_t *const source = source_data_;
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
-#if CONFIG_VP9_HIGHBITDEPTH
         if (!use_high_bit_depth_) {
-          sad +=
-              abs(source8[h * source_stride_ + w] -
-                  reference8[h * reference_stride_ + w]);
+          sad += abs(source8[h * source_stride_ + w] -
+                     reference8[h * reference_stride_ + w]);
+#if CONFIG_VP9_HIGHBITDEPTH
         } else {
-          sad +=
-              abs(source16[h * source_stride_ + w] -
-                  reference16[h * reference_stride_ + w]);
+          sad += abs(source16[h * source_stride_ + w] -
+                     reference16[h * reference_stride_ + w]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         }
-#else
-        sad +=
-            abs(source[h * source_stride_ + w] -
-                reference[h * reference_stride_ + w]);
-#endif
-      }
-      if (sad > max_sad) {
-        break;
       }
     }
     return sad;
@@ -204,85 +170,69 @@ class SADTestBase : public ::testing::Test {
   // Sum of Absolute Differences Average. Given two blocks, and a prediction
   // calculate the absolute difference between one pixel and average of the
   // corresponding and predicted pixels; accumulate.
-  unsigned int ReferenceSADavg(unsigned int max_sad, int block_idx) {
+  unsigned int ReferenceSADavg(int block_idx) {
     unsigned int sad = 0;
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const source8 = source_data_;
+    const uint8_t *const second_pred8 = second_pred_;
 #if CONFIG_VP9_HIGHBITDEPTH
-    const uint8_t *const reference8 = GetReference(block_idx);
-    const uint8_t *const source8 = source_data_;
-    const uint8_t *const second_pred8 = second_pred_;
-    const uint16_t *const reference16 =
-        CONVERT_TO_SHORTPTR(GetReference(block_idx));
-    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
-    const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
-#else
-    const uint8_t *const reference = GetReference(block_idx);
-    const uint8_t *const source = source_data_;
-    const uint8_t *const second_pred = second_pred_;
-#endif
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+    const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
-#if CONFIG_VP9_HIGHBITDEPTH
         if (!use_high_bit_depth_) {
           const int tmp = second_pred8[h * width_ + w] +
               reference8[h * reference_stride_ + w];
           const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
           sad += abs(source8[h * source_stride_ + w] - comp_pred);
+#if CONFIG_VP9_HIGHBITDEPTH
         } else {
           const int tmp = second_pred16[h * width_ + w] +
              reference16[h * reference_stride_ + w];
           const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
           sad += abs(source16[h * source_stride_ + w] - comp_pred);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         }
-#else
-        const int tmp = second_pred[h * width_ + w] +
-            reference[h * reference_stride_ + w];
-        const uint8_t comp_pred = (tmp + 1) >> 1;
-        sad += abs(source[h * source_stride_ + w] - comp_pred);
-#endif
-      }
-      if (sad > max_sad) {
-        break;
       }
     }
     return sad;
   }
 
   void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
-#if CONFIG_VP9_HIGHBITDEPTH
     uint8_t *data8 = data;
+#if CONFIG_VP9_HIGHBITDEPTH
     uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
-#if CONFIG_VP9_HIGHBITDEPTH
         if (!use_high_bit_depth_) {
          data8[h * stride + w] = static_cast<uint8_t>(fill_constant);
+#if CONFIG_VP9_HIGHBITDEPTH
         } else {
           data16[h * stride + w] = fill_constant;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         }
-#else
-        data[h * stride + w] = static_cast<uint8_t>(fill_constant);
-#endif
       }
     }
   }
 
   void FillRandom(uint8_t *data, int stride) {
-#if CONFIG_VP9_HIGHBITDEPTH
     uint8_t *data8 = data;
+#if CONFIG_VP9_HIGHBITDEPTH
     uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
-#if CONFIG_VP9_HIGHBITDEPTH
         if (!use_high_bit_depth_) {
           data8[h * stride + w] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
         } else {
           data16[h * stride + w] = rnd_.Rand16() & mask_;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         }
-#else
-        data[h * stride + w] = rnd_.Rand8();
-#endif
       }
     }
   }
 
@@ -293,7 +243,6 @@ class SADTestBase : public ::testing::Test {
   static uint8_t *source_data_;
   static uint8_t *reference_data_;
   static uint8_t *second_pred_;
   int source_stride_;
-#if CONFIG_VP9_HIGHBITDEPTH
   bool use_high_bit_depth_;
   static uint8_t *source_data8_;
   static uint8_t *reference_data8_;
@@ -301,7 +250,6 @@ class SADTestBase : public ::testing::Test {
   static uint16_t *source_data16_;
   static uint16_t *reference_data16_;
   static uint16_t *second_pred16_;
-#endif
   int reference_stride_;
 
   ACMRandom rnd_;
@@ -315,11 +263,11 @@ class SADx4Test
  protected:
   void SADs(unsigned int *results) {
-    const uint8_t *refs[] = {GetReference(0), GetReference(1),
-                             GetReference(2), GetReference(3)};
+    const uint8_t *references[] = {GetReference(0), GetReference(1),
+                                   GetReference(2), GetReference(3)};
 
     ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
-                                          refs, reference_stride_,
+                                          references, reference_stride_,
                                           results));
   }
 
@@ -328,52 +276,19 @@ class SADx4Test
     SADs(exp_sad);
 
     for (int block = 0; block < 4; ++block) {
-      reference_sad = ReferenceSAD(UINT_MAX, block);
+      reference_sad = ReferenceSAD(block);
 
       EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
   }
 };
 
-#if CONFIG_VP8_ENCODER
 class SADTest
     : public SADTestBase,
      public ::testing::WithParamInterface<SadMxNParam> {
  public:
   SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
- protected:
-  unsigned int SAD(unsigned int max_sad, int block_idx) {
-    unsigned int ret;
-    const uint8_t *const reference = GetReference(block_idx);
-
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
-                                                reference, reference_stride_,
-                                                max_sad));
-    return ret;
-  }
-
-  void CheckSAD(unsigned int max_sad) {
-    const unsigned int reference_sad = ReferenceSAD(max_sad, 0);
-    const unsigned int exp_sad = SAD(max_sad, 0);
-
-    if (reference_sad <= max_sad) {
-      ASSERT_EQ(exp_sad, reference_sad);
-    } else {
-      // Alternative implementations are not required to check max_sad
-      ASSERT_GE(exp_sad, reference_sad);
-    }
-  }
-};
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-class SADVP9Test
-    : public SADTestBase,
-      public ::testing::WithParamInterface<SadMxNVp9Param> {
- public:
-  SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
-
  protected:
   unsigned int SAD(int block_idx) {
     unsigned int ret;
@@ -385,18 +300,18 @@ class SADVP9Test
   }
 
   void CheckSAD() {
-    const unsigned int reference_sad = ReferenceSAD(UINT_MAX, 0);
+    const unsigned int reference_sad = ReferenceSAD(0);
     const unsigned int exp_sad = SAD(0);
 
     ASSERT_EQ(reference_sad, exp_sad);
   }
 };
 
-class SADavgVP9Test
+class SADavgTest
     : public SADTestBase,
-      public ::testing::WithParamInterface<SadMxNAvgVp9Param> {
+      public ::testing::WithParamInterface<SadMxNAvgParam> {
  public:
-  SADavgVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+  SADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
  protected:
   unsigned int SAD_avg(int block_idx) {
@@ -410,91 +325,36 @@ class SADavgVP9Test
   }
 
   void CheckSAD() {
-    const unsigned int reference_sad = ReferenceSADavg(UINT_MAX, 0);
+    const unsigned int reference_sad = ReferenceSADavg(0);
     const unsigned int exp_sad = SAD_avg(0);
 
     ASSERT_EQ(reference_sad, exp_sad);
   }
 };
-#endif  // CONFIG_VP9_ENCODER
 
 uint8_t *SADTestBase::source_data_ = NULL;
 uint8_t *SADTestBase::reference_data_ = NULL;
 uint8_t *SADTestBase::second_pred_ = NULL;
-#if CONFIG_VP9_ENCODER && CONFIG_VP9_HIGHBITDEPTH
 uint8_t *SADTestBase::source_data8_ = NULL;
 uint8_t *SADTestBase::reference_data8_ = NULL;
 uint8_t *SADTestBase::second_pred8_ = NULL;
 uint16_t *SADTestBase::source_data16_ = NULL;
 uint16_t *SADTestBase::reference_data16_ = NULL;
 uint16_t *SADTestBase::second_pred16_ = NULL;
-#endif
 
-#if CONFIG_VP8_ENCODER
 TEST_P(SADTest, MaxRef) {
-  FillConstant(source_data_, source_stride_, 0);
-  FillConstant(reference_data_, reference_stride_, mask_);
-  CheckSAD(UINT_MAX);
-}
-
-TEST_P(SADTest, MaxSrc) {
-  FillConstant(source_data_, source_stride_, mask_);
-  FillConstant(reference_data_, reference_stride_, 0);
-  CheckSAD(UINT_MAX);
-}
-
-TEST_P(SADTest, ShortRef) {
-  int tmp_stride = reference_stride_;
-  reference_stride_ >>= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSAD(UINT_MAX);
-  reference_stride_ = tmp_stride;
-}
-
-TEST_P(SADTest, UnalignedRef) {
-  // The reference frame, but not the source frame, may be unaligned for
-  // certain types of searches.
-  const int tmp_stride = reference_stride_;
-  reference_stride_ -= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSAD(UINT_MAX);
-  reference_stride_ = tmp_stride;
-}
-
-TEST_P(SADTest, ShortSrc) {
-  const int tmp_stride = source_stride_;
-  source_stride_ >>= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSAD(UINT_MAX);
-  source_stride_ = tmp_stride;
-}
-
-TEST_P(SADTest, MaxSAD) {
-  // Verify that, when max_sad is set, the implementation does not return a
-  // value lower than the reference.
-  FillConstant(source_data_, source_stride_, mask_);
-  FillConstant(reference_data_, reference_stride_, 0);
-  CheckSAD(128);
-}
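An aside on why this test could not survive the move: max_sad was an early-out hint, not part of the result contract. The old reference implementation stopped accumulating after any row pushed the total past max_sad, so it produced the smallest value a conforming early-exit implementation could return; optimized versions were free to check less often, or never, which is why the removed CheckSAD(max_sad) above only demanded exp_sad >= reference_sad once the threshold was crossed. The unified vpx_sad* signatures drop the parameter entirely. In sketch form (illustrative, not code from this patch):

    // Old contract: the return value is exact only while sad <= max_sad.
    unsigned int sad = 0;
    for (int h = 0; h < height; ++h) {
      for (int w = 0; w < width; ++w)
        sad += abs(src[h * src_stride + w] - ref[h * ref_stride + w]);
      if (sad > max_sad) break;  // any value above the threshold is acceptable
    }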
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-TEST_P(SADVP9Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, mask_);
   CheckSAD();
 }
 
-TEST_P(SADVP9Test, MaxSrc) {
+TEST_P(SADTest, MaxSrc) {
   FillConstant(source_data_, source_stride_, mask_);
   FillConstant(reference_data_, reference_stride_, 0);
   CheckSAD();
 }
 
-TEST_P(SADVP9Test, ShortRef) {
+TEST_P(SADTest, ShortRef) {
   const int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -503,7 +363,7 @@ TEST_P(SADVP9Test, ShortRef) {
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(SADVP9Test, UnalignedRef) {
+TEST_P(SADTest, UnalignedRef) {
   // The reference frame, but not the source frame, may be unaligned for
   // certain types of searches.
   const int tmp_stride = reference_stride_;
@@ -514,7 +374,7 @@ TEST_P(SADVP9Test, UnalignedRef) {
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(SADVP9Test, ShortSrc) {
+TEST_P(SADTest, ShortSrc) {
   const int tmp_stride = source_stride_;
   source_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -523,20 +383,20 @@ TEST_P(SADVP9Test, ShortSrc) {
   source_stride_ = tmp_stride;
 }
 
-TEST_P(SADavgVP9Test, MaxRef) {
+TEST_P(SADavgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, mask_);
   FillConstant(second_pred_, width_, 0);
   CheckSAD();
 }
 
-TEST_P(SADavgVP9Test, MaxSrc) {
+TEST_P(SADavgTest, MaxSrc) {
   FillConstant(source_data_, source_stride_, mask_);
   FillConstant(reference_data_, reference_stride_, 0);
   FillConstant(second_pred_, width_, 0);
   CheckSAD();
 }
 
-TEST_P(SADavgVP9Test, ShortRef) {
+TEST_P(SADavgTest, ShortRef) {
   const int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -546,7 +406,7 @@ TEST_P(SADavgVP9Test, ShortRef) {
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(SADavgVP9Test, UnalignedRef) {
+TEST_P(SADavgTest, UnalignedRef) {
   // The reference frame, but not the source frame, may be unaligned for
   // certain types of searches.
   const int tmp_stride = reference_stride_;
@@ -558,7 +418,7 @@ TEST_P(SADavgVP9Test, UnalignedRef) {
   reference_stride_ = tmp_stride;
 }
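The alignment caveat in these tests matters mostly to the SIMD versions under test: a motion search can hand the SAD kernel a reference pointer at any byte offset, while the source block stays packed and aligned. An SSE2 implementation therefore has to use unaligned loads for the reference rows, roughly like this (illustrative fragment for one 16-pixel row; assumptions, not code from this patch):

    // #include <emmintrin.h>
    const __m128i s = _mm_load_si128((const __m128i *)src);   // src is aligned
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);  // ref may not be
    // psadbw: absolute byte differences summed into two 64-bit halves.
    const __m128i d = _mm_sad_epu8(s, r);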
 
-TEST_P(SADavgVP9Test, ShortSrc) {
+TEST_P(SADavgTest, ShortSrc) {
   const int tmp_stride = source_stride_;
   source_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -567,7 +427,6 @@ TEST_P(SADavgVP9Test, ShortSrc) {
   CheckSAD();
   source_stride_ = tmp_stride;
 }
-#endif  // CONFIG_VP9_ENCODER
 
 TEST_P(SADx4Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
@@ -641,617 +500,633 @@ using std::tr1::make_tuple;
 
 //------------------------------------------------------------------------------
 // C functions
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_c = vp8_sad16x16_c;
-const SadMxNFunc sad_8x16_c = vp8_sad8x16_c;
-const SadMxNFunc sad_16x8_c = vp8_sad16x8_c;
-const SadMxNFunc sad_8x8_c = vp8_sad8x8_c;
-const SadMxNFunc sad_4x4_c = vp8_sad4x4_c;
+const SadMxNFunc sad64x64_c = vpx_sad64x64_c;
+const SadMxNFunc sad64x32_c = vpx_sad64x32_c;
+const SadMxNFunc sad32x64_c = vpx_sad32x64_c;
+const SadMxNFunc sad32x32_c = vpx_sad32x32_c;
+const SadMxNFunc sad32x16_c = vpx_sad32x16_c;
+const SadMxNFunc sad16x32_c = vpx_sad16x32_c;
+const SadMxNFunc sad16x16_c = vpx_sad16x16_c;
+const SadMxNFunc sad16x8_c = vpx_sad16x8_c;
+const SadMxNFunc sad8x16_c = vpx_sad8x16_c;
+const SadMxNFunc sad8x8_c = vpx_sad8x8_c;
+const SadMxNFunc sad8x4_c = vpx_sad8x4_c;
+const SadMxNFunc sad4x8_c = vpx_sad4x8_c;
+const SadMxNFunc sad4x4_c = vpx_sad4x4_c;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNFunc highbd_sad64x64_c = vpx_highbd_sad64x64_c;
+const SadMxNFunc highbd_sad64x32_c = vpx_highbd_sad64x32_c;
+const SadMxNFunc highbd_sad32x64_c = vpx_highbd_sad32x64_c;
+const SadMxNFunc highbd_sad32x32_c = vpx_highbd_sad32x32_c;
+const SadMxNFunc highbd_sad32x16_c = vpx_highbd_sad32x16_c;
+const SadMxNFunc highbd_sad16x32_c = vpx_highbd_sad16x32_c;
+const SadMxNFunc highbd_sad16x16_c = vpx_highbd_sad16x16_c;
+const SadMxNFunc highbd_sad16x8_c = vpx_highbd_sad16x8_c;
+const SadMxNFunc highbd_sad8x16_c = vpx_highbd_sad8x16_c;
+const SadMxNFunc highbd_sad8x8_c = vpx_highbd_sad8x8_c;
+const SadMxNFunc highbd_sad8x4_c = vpx_highbd_sad8x4_c;
+const SadMxNFunc highbd_sad4x8_c = vpx_highbd_sad4x8_c;
+const SadMxNFunc highbd_sad4x4_c = vpx_highbd_sad4x4_c;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 const SadMxNParam c_tests[] = {
-  make_tuple(16, 16, sad_16x16_c, -1),
-  make_tuple(8, 16, sad_8x16_c, -1),
-  make_tuple(16, 8, sad_16x8_c, -1),
-  make_tuple(8, 8, sad_8x8_c, -1),
-  make_tuple(4, 4, sad_4x4_c, -1),
+  make_tuple(64, 64, sad64x64_c, -1),
+  make_tuple(64, 32, sad64x32_c, -1),
+  make_tuple(32, 64, sad32x64_c, -1),
+  make_tuple(32, 32, sad32x32_c, -1),
+  make_tuple(32, 16, sad32x16_c, -1),
+  make_tuple(16, 32, sad16x32_c, -1),
+  make_tuple(16, 16, sad16x16_c, -1),
+  make_tuple(16, 8, sad16x8_c, -1),
+  make_tuple(8, 16, sad8x16_c, -1),
+  make_tuple(8, 8, sad8x8_c, -1),
+  make_tuple(8, 4, sad8x4_c, -1),
+  make_tuple(4, 8, sad4x8_c, -1),
+  make_tuple(4, 4, sad4x4_c, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64_c, 8),
+  make_tuple(64, 32, highbd_sad64x32_c, 8),
+  make_tuple(32, 64, highbd_sad32x64_c, 8),
+  make_tuple(32, 32, highbd_sad32x32_c, 8),
+  make_tuple(32, 16, highbd_sad32x16_c, 8),
+  make_tuple(16, 32, highbd_sad16x32_c, 8),
+  make_tuple(16, 16, highbd_sad16x16_c, 8),
+  make_tuple(16, 8, highbd_sad16x8_c, 8),
+  make_tuple(8, 16, highbd_sad8x16_c, 8),
+  make_tuple(8, 8, highbd_sad8x8_c, 8),
+  make_tuple(8, 4, highbd_sad8x4_c, 8),
+  make_tuple(4, 8, highbd_sad4x8_c, 8),
+  make_tuple(4, 4, highbd_sad4x4_c, 8),
+  make_tuple(64, 64, highbd_sad64x64_c, 10),
+  make_tuple(64, 32, highbd_sad64x32_c, 10),
+  make_tuple(32, 64, highbd_sad32x64_c, 10),
+  make_tuple(32, 32, highbd_sad32x32_c, 10),
+  make_tuple(32, 16, highbd_sad32x16_c, 10),
+  make_tuple(16, 32, highbd_sad16x32_c, 10),
+  make_tuple(16, 16, highbd_sad16x16_c, 10),
+  make_tuple(16, 8, highbd_sad16x8_c, 10),
+  make_tuple(8, 16, highbd_sad8x16_c, 10),
+  make_tuple(8, 8, highbd_sad8x8_c, 10),
+  make_tuple(8, 4, highbd_sad8x4_c, 10),
+  make_tuple(4, 8, highbd_sad4x8_c, 10),
+  make_tuple(4, 4, highbd_sad4x4_c, 10),
+  make_tuple(64, 64, highbd_sad64x64_c, 12),
+  make_tuple(64, 32, highbd_sad64x32_c, 12),
+  make_tuple(32, 64, highbd_sad32x64_c, 12),
+  make_tuple(32, 32, highbd_sad32x32_c, 12),
+  make_tuple(32, 16, highbd_sad32x16_c, 12),
+  make_tuple(16, 32, highbd_sad16x32_c, 12),
+  make_tuple(16, 16, highbd_sad16x16_c, 12),
+  make_tuple(16, 8, highbd_sad16x8_c, 12),
+  make_tuple(8, 16, highbd_sad8x16_c, 12),
+  make_tuple(8, 8, highbd_sad8x8_c, 12),
+  make_tuple(8, 4, highbd_sad8x4_c, 12),
+  make_tuple(4, 8, highbd_sad4x8_c, 12),
+  make_tuple(4, 4, highbd_sad4x4_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 
 INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-const SadMxNVp9Func sad_64x64_c_vp9 = vp9_sad64x64_c;
-const SadMxNVp9Func sad_32x32_c_vp9 = vp9_sad32x32_c;
-const SadMxNVp9Func sad_16x16_c_vp9 = vp9_sad16x16_c;
-const SadMxNVp9Func sad_8x16_c_vp9 = vp9_sad8x16_c;
-const SadMxNVp9Func sad_16x8_c_vp9 = vp9_sad16x8_c;
-const SadMxNVp9Func sad_8x8_c_vp9 = vp9_sad8x8_c;
-const SadMxNVp9Func sad_8x4_c_vp9 = vp9_sad8x4_c;
-const SadMxNVp9Func sad_4x8_c_vp9 = vp9_sad4x8_c;
-const SadMxNVp9Func sad_4x4_c_vp9 = vp9_sad4x4_c;
-const SadMxNVp9Param c_vp9_tests[] = {
-  make_tuple(64, 64, sad_64x64_c_vp9, -1),
-  make_tuple(32, 32, sad_32x32_c_vp9, -1),
-  make_tuple(16, 16, sad_16x16_c_vp9, -1),
-  make_tuple(8, 16, sad_8x16_c_vp9, -1),
-  make_tuple(16, 8, sad_16x8_c_vp9, -1),
-  make_tuple(8, 8, sad_8x8_c_vp9, -1),
-  make_tuple(8, 4, sad_8x4_c_vp9, -1),
-  make_tuple(4, 8, sad_4x8_c_vp9, -1),
-  make_tuple(4, 4, sad_4x4_c_vp9, -1),
-};
-INSTANTIATE_TEST_CASE_P(C, SADVP9Test, ::testing::ValuesIn(c_vp9_tests));
-
-const SadMxNx4Func sad_64x64x4d_c = vp9_sad64x64x4d_c;
-const SadMxNx4Func sad_64x32x4d_c = vp9_sad64x32x4d_c;
-const SadMxNx4Func sad_32x64x4d_c = vp9_sad32x64x4d_c;
-const SadMxNx4Func sad_32x32x4d_c = vp9_sad32x32x4d_c;
-const SadMxNx4Func sad_32x16x4d_c = vp9_sad32x16x4d_c;
-const SadMxNx4Func sad_16x32x4d_c = vp9_sad16x32x4d_c;
-const SadMxNx4Func sad_16x16x4d_c = vp9_sad16x16x4d_c;
-const SadMxNx4Func sad_16x8x4d_c = vp9_sad16x8x4d_c;
-const SadMxNx4Func sad_8x16x4d_c = vp9_sad8x16x4d_c;
-const SadMxNx4Func sad_8x8x4d_c = vp9_sad8x8x4d_c;
-const SadMxNx4Func sad_8x4x4d_c = vp9_sad8x4x4d_c;
-const SadMxNx4Func sad_4x8x4d_c = vp9_sad4x8x4d_c;
-const SadMxNx4Func sad_4x4x4d_c = vp9_sad4x4x4d_c;
-INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values(
-  make_tuple(64, 64, sad_64x64x4d_c, -1),
-  make_tuple(64, 32, sad_64x32x4d_c, -1),
-  make_tuple(32, 64, sad_32x64x4d_c, -1),
-  make_tuple(32, 32, sad_32x32x4d_c, -1),
-  make_tuple(32, 16, sad_32x16x4d_c, -1),
-  make_tuple(16, 32, sad_16x32x4d_c, -1),
-  make_tuple(16, 16, sad_16x16x4d_c, -1),
-  make_tuple(16, 8, sad_16x8x4d_c, -1),
-  make_tuple(8, 16, sad_8x16x4d_c, -1),
-  make_tuple(8, 8, sad_8x8x4d_c, -1),
-  make_tuple(8, 4, sad_8x4x4d_c, -1),
-  make_tuple(4, 8, sad_4x8x4d_c, -1),
-  make_tuple(4, 4, sad_4x4x4d_c, -1)));
 
+const SadMxNAvgFunc sad64x64_avg_c = vpx_sad64x64_avg_c;
+const SadMxNAvgFunc sad64x32_avg_c = vpx_sad64x32_avg_c;
+const SadMxNAvgFunc sad32x64_avg_c = vpx_sad32x64_avg_c;
+const SadMxNAvgFunc sad32x32_avg_c = vpx_sad32x32_avg_c;
+const SadMxNAvgFunc sad32x16_avg_c = vpx_sad32x16_avg_c;
+const SadMxNAvgFunc sad16x32_avg_c = vpx_sad16x32_avg_c;
+const SadMxNAvgFunc sad16x16_avg_c = vpx_sad16x16_avg_c;
+const SadMxNAvgFunc sad16x8_avg_c = vpx_sad16x8_avg_c;
+const SadMxNAvgFunc sad8x16_avg_c = vpx_sad8x16_avg_c;
+const SadMxNAvgFunc sad8x8_avg_c = vpx_sad8x8_avg_c;
+const SadMxNAvgFunc sad8x4_avg_c = vpx_sad8x4_avg_c;
+const SadMxNAvgFunc sad4x8_avg_c = vpx_sad4x8_avg_c;
+const SadMxNAvgFunc sad4x4_avg_c = vpx_sad4x4_avg_c;
 #if CONFIG_VP9_HIGHBITDEPTH
-const SadMxNVp9Func highbd_sad_64x64_c_vp9 = vp9_highbd_sad64x64_c;
-const SadMxNVp9Func highbd_sad_32x32_c_vp9 = vp9_highbd_sad32x32_c;
-const SadMxNVp9Func highbd_sad_16x16_c_vp9 = vp9_highbd_sad16x16_c;
-const SadMxNVp9Func highbd_sad_8x16_c_vp9 = vp9_highbd_sad8x16_c;
-const SadMxNVp9Func highbd_sad_16x8_c_vp9 = vp9_highbd_sad16x8_c;
-const SadMxNVp9Func highbd_sad_8x8_c_vp9 = vp9_highbd_sad8x8_c;
-const SadMxNVp9Func highbd_sad_8x4_c_vp9 = vp9_highbd_sad8x4_c;
-const SadMxNVp9Func highbd_sad_4x8_c_vp9 = vp9_highbd_sad4x8_c;
-const SadMxNVp9Func highbd_sad_4x4_c_vp9 = vp9_highbd_sad4x4_c;
-const SadMxNVp9Param c_vp9_highbd_8_tests[] = {
-  make_tuple(64, 64, highbd_sad_64x64_c_vp9, 8),
-  make_tuple(32, 32, highbd_sad_32x32_c_vp9, 8),
-  make_tuple(16, 16, highbd_sad_16x16_c_vp9, 8),
-  make_tuple(8, 16, highbd_sad_8x16_c_vp9, 8),
-  make_tuple(16, 8, highbd_sad_16x8_c_vp9, 8),
-  make_tuple(8, 8, highbd_sad_8x8_c_vp9, 8),
-  make_tuple(8, 4, highbd_sad_8x4_c_vp9, 8),
-  make_tuple(4, 8, highbd_sad_4x8_c_vp9, 8),
-  make_tuple(4, 4, highbd_sad_4x4_c_vp9, 8),
-};
-INSTANTIATE_TEST_CASE_P(C_8, SADVP9Test,
-                        ::testing::ValuesIn(c_vp9_highbd_8_tests));
-
-const SadMxNVp9Param c_vp9_highbd_10_tests[] = {
-  make_tuple(64, 64, highbd_sad_64x64_c_vp9, 10),
-  make_tuple(32, 32, highbd_sad_32x32_c_vp9, 10),
-  make_tuple(16, 16, highbd_sad_16x16_c_vp9, 10),
-  make_tuple(8, 16, highbd_sad_8x16_c_vp9, 10),
-  make_tuple(16, 8, highbd_sad_16x8_c_vp9, 10),
-  make_tuple(8, 8, highbd_sad_8x8_c_vp9, 10),
-  make_tuple(8, 4, highbd_sad_8x4_c_vp9, 10),
-  make_tuple(4, 8, highbd_sad_4x8_c_vp9, 10),
-  make_tuple(4, 4, highbd_sad_4x4_c_vp9, 10),
-};
-INSTANTIATE_TEST_CASE_P(C_10, SADVP9Test,
-                        ::testing::ValuesIn(c_vp9_highbd_10_tests));
-
-const SadMxNVp9Param c_vp9_highbd_12_tests[] = {
-  make_tuple(64, 64, highbd_sad_64x64_c_vp9, 12),
-  make_tuple(32, 32, highbd_sad_32x32_c_vp9, 12),
-  make_tuple(16, 16, highbd_sad_16x16_c_vp9, 12),
-  make_tuple(8, 16, highbd_sad_8x16_c_vp9, 12),
-  make_tuple(16, 8, highbd_sad_16x8_c_vp9, 12),
-  make_tuple(8, 8, highbd_sad_8x8_c_vp9, 12),
-  make_tuple(8, 4, highbd_sad_8x4_c_vp9, 12),
-  make_tuple(4, 8, highbd_sad_4x8_c_vp9, 12),
-  make_tuple(4, 4, highbd_sad_4x4_c_vp9, 12),
+const SadMxNAvgFunc highbd_sad64x64_avg_c = vpx_highbd_sad64x64_avg_c;
+const SadMxNAvgFunc highbd_sad64x32_avg_c = vpx_highbd_sad64x32_avg_c;
+const SadMxNAvgFunc highbd_sad32x64_avg_c = vpx_highbd_sad32x64_avg_c;
+const SadMxNAvgFunc highbd_sad32x32_avg_c = vpx_highbd_sad32x32_avg_c;
+const SadMxNAvgFunc highbd_sad32x16_avg_c = vpx_highbd_sad32x16_avg_c;
+const SadMxNAvgFunc highbd_sad16x32_avg_c = vpx_highbd_sad16x32_avg_c;
+const SadMxNAvgFunc highbd_sad16x16_avg_c = vpx_highbd_sad16x16_avg_c;
+const SadMxNAvgFunc highbd_sad16x8_avg_c = vpx_highbd_sad16x8_avg_c;
+const SadMxNAvgFunc highbd_sad8x16_avg_c = vpx_highbd_sad8x16_avg_c;
+const SadMxNAvgFunc highbd_sad8x8_avg_c = vpx_highbd_sad8x8_avg_c;
+const SadMxNAvgFunc highbd_sad8x4_avg_c = vpx_highbd_sad8x4_avg_c;
+const SadMxNAvgFunc highbd_sad4x8_avg_c = vpx_highbd_sad4x8_avg_c;
+const SadMxNAvgFunc highbd_sad4x4_avg_c = vpx_highbd_sad4x4_avg_c;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+const SadMxNAvgParam avg_c_tests[] = {
+  make_tuple(64, 64, sad64x64_avg_c, -1),
+  make_tuple(64, 32, sad64x32_avg_c, -1),
+  make_tuple(32, 64, sad32x64_avg_c, -1),
+  make_tuple(32, 32, sad32x32_avg_c, -1),
+  make_tuple(32, 16, sad32x16_avg_c, -1),
+  make_tuple(16, 32, sad16x32_avg_c, -1),
+  make_tuple(16, 16, sad16x16_avg_c, -1),
+  make_tuple(16, 8, sad16x8_avg_c, -1),
+  make_tuple(8, 16, sad8x16_avg_c, -1),
+  make_tuple(8, 8, sad8x8_avg_c, -1),
+  make_tuple(8, 4, sad8x4_avg_c, -1),
+  make_tuple(4, 8, sad4x8_avg_c, -1),
+  make_tuple(4, 4, sad4x4_avg_c, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64_avg_c, 8),
+  make_tuple(64, 32, highbd_sad64x32_avg_c, 8),
+  make_tuple(32, 64, highbd_sad32x64_avg_c, 8),
+  make_tuple(32, 32, highbd_sad32x32_avg_c, 8),
+  make_tuple(32, 16, highbd_sad32x16_avg_c, 8),
+  make_tuple(16, 32, highbd_sad16x32_avg_c, 8),
+  make_tuple(16, 16, highbd_sad16x16_avg_c, 8),
+  make_tuple(16, 8, highbd_sad16x8_avg_c, 8),
+  make_tuple(8, 16, highbd_sad8x16_avg_c, 8),
+  make_tuple(8, 8, highbd_sad8x8_avg_c, 8),
+  make_tuple(8, 4, highbd_sad8x4_avg_c, 8),
+  make_tuple(4, 8, highbd_sad4x8_avg_c, 8),
+  make_tuple(4, 4, highbd_sad4x4_avg_c, 8),
+  make_tuple(64, 64, highbd_sad64x64_avg_c, 10),
+  make_tuple(64, 32, highbd_sad64x32_avg_c, 10),
+  make_tuple(32, 64, highbd_sad32x64_avg_c, 10),
+  make_tuple(32, 32, highbd_sad32x32_avg_c, 10),
+  make_tuple(32, 16, highbd_sad32x16_avg_c, 10),
+  make_tuple(16, 32, highbd_sad16x32_avg_c, 10),
+  make_tuple(16, 16, highbd_sad16x16_avg_c, 10),
+  make_tuple(16, 8, highbd_sad16x8_avg_c, 10),
+  make_tuple(8, 16, highbd_sad8x16_avg_c, 10),
+  make_tuple(8, 8, highbd_sad8x8_avg_c, 10),
+  make_tuple(8, 4, highbd_sad8x4_avg_c, 10),
+  make_tuple(4, 8, highbd_sad4x8_avg_c, 10),
+  make_tuple(4, 4, highbd_sad4x4_avg_c, 10),
+  make_tuple(64, 64, highbd_sad64x64_avg_c, 12),
+  make_tuple(64, 32, highbd_sad64x32_avg_c, 12),
+  make_tuple(32, 64, highbd_sad32x64_avg_c, 12),
+  make_tuple(32, 32, highbd_sad32x32_avg_c, 12),
+  make_tuple(32, 16, highbd_sad32x16_avg_c, 12),
+  make_tuple(16, 32, highbd_sad16x32_avg_c, 12),
+  make_tuple(16, 16, highbd_sad16x16_avg_c, 12),
+  make_tuple(16, 8, highbd_sad16x8_avg_c, 12),
+  make_tuple(8, 16, highbd_sad8x16_avg_c, 12),
+  make_tuple(8, 8, highbd_sad8x8_avg_c, 12),
+  make_tuple(8, 4, highbd_sad8x4_avg_c, 12),
+  make_tuple(4, 8, highbd_sad4x8_avg_c, 12),
+  make_tuple(4, 4, highbd_sad4x4_avg_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(C_12, SADVP9Test,
-                        ::testing::ValuesIn(c_vp9_highbd_12_tests));
-
-const SadMxNAvgVp9Func highbd_sad8x4_avg_c_vp9 = vp9_highbd_sad8x4_avg_c;
-const SadMxNAvgVp9Func highbd_sad8x8_avg_c_vp9 = vp9_highbd_sad8x8_avg_c;
-const SadMxNAvgVp9Func highbd_sad8x16_avg_c_vp9 = vp9_highbd_sad8x16_avg_c;
-const SadMxNAvgVp9Func highbd_sad16x8_avg_c_vp9 = vp9_highbd_sad16x8_avg_c;
-const SadMxNAvgVp9Func highbd_sad16x16_avg_c_vp9 = vp9_highbd_sad16x16_avg_c;
-const SadMxNAvgVp9Func highbd_sad16x32_avg_c_vp9 = vp9_highbd_sad16x32_avg_c;
-const SadMxNAvgVp9Func highbd_sad32x16_avg_c_vp9 = vp9_highbd_sad32x16_avg_c;
-const SadMxNAvgVp9Func highbd_sad32x32_avg_c_vp9 = vp9_highbd_sad32x32_avg_c;
-const SadMxNAvgVp9Func highbd_sad32x64_avg_c_vp9 = vp9_highbd_sad32x64_avg_c;
-const SadMxNAvgVp9Func highbd_sad64x32_avg_c_vp9 = vp9_highbd_sad64x32_avg_c;
-const SadMxNAvgVp9Func highbd_sad64x64_avg_c_vp9 = vp9_highbd_sad64x64_avg_c;
-SadMxNAvgVp9Param avg_c_vp9_highbd_8_tests[] = {
-  make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 8),
-  make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 8),
-  make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 8),
-  make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 8),
-  make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 8),
-  make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 8),
-  make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 8),
-  make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 8),
-  make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 8),
-  make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 8),
-  make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 8)};
-INSTANTIATE_TEST_CASE_P(C_8, SADavgVP9Test,
-                        ::testing::ValuesIn(avg_c_vp9_highbd_8_tests));
-
-SadMxNAvgVp9Param avg_c_vp9_highbd_10_tests[] = {
-  make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 10),
-  make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 10),
-  make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 10),
-  make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 10),
-  make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 10),
-  make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 10),
-  make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 10),
-  make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 10),
-  make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 10),
-  make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 10),
-  make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 10)};
-INSTANTIATE_TEST_CASE_P(C_10, SADavgVP9Test,
-                        ::testing::ValuesIn(avg_c_vp9_highbd_10_tests));
-
-SadMxNAvgVp9Param avg_c_vp9_highbd_12_tests[] = {
-  make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 12),
-  make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 12),
-  make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 12),
-  make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 12),
-  make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 12),
-  make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 12),
-  make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 12),
-  make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 12),
-  make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 12),
-  make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 12),
-  make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 12)};
-INSTANTIATE_TEST_CASE_P(C_12, SADavgVP9Test,
-                        ::testing::ValuesIn(avg_c_vp9_highbd_12_tests));
-
-const SadMxNx4Func highbd_sad_64x64x4d_c = vp9_highbd_sad64x64x4d_c;
-const SadMxNx4Func highbd_sad_64x32x4d_c = vp9_highbd_sad64x32x4d_c;
-const SadMxNx4Func highbd_sad_32x64x4d_c = vp9_highbd_sad32x64x4d_c;
-const SadMxNx4Func highbd_sad_32x32x4d_c = vp9_highbd_sad32x32x4d_c;
-const SadMxNx4Func highbd_sad_32x16x4d_c = vp9_highbd_sad32x16x4d_c;
-const SadMxNx4Func highbd_sad_16x32x4d_c = vp9_highbd_sad16x32x4d_c;
-const SadMxNx4Func highbd_sad_16x16x4d_c = vp9_highbd_sad16x16x4d_c;
-const SadMxNx4Func highbd_sad_16x8x4d_c = vp9_highbd_sad16x8x4d_c;
-const SadMxNx4Func highbd_sad_8x16x4d_c = vp9_highbd_sad8x16x4d_c;
-const SadMxNx4Func highbd_sad_8x8x4d_c = vp9_highbd_sad8x8x4d_c;
-const SadMxNx4Func highbd_sad_8x4x4d_c = vp9_highbd_sad8x4x4d_c;
-const SadMxNx4Func highbd_sad_4x8x4d_c = vp9_highbd_sad4x8x4d_c;
-const SadMxNx4Func highbd_sad_4x4x4d_c = vp9_highbd_sad4x4x4d_c;
-INSTANTIATE_TEST_CASE_P(C_8, SADx4Test, ::testing::Values(
-  make_tuple(64, 64, highbd_sad_64x64x4d_c, 8),
-  make_tuple(64, 32, highbd_sad_64x32x4d_c, 8),
-  make_tuple(32, 64, highbd_sad_32x64x4d_c, 8),
-  make_tuple(32, 32, highbd_sad_32x32x4d_c, 8),
-  make_tuple(32, 16, highbd_sad_32x16x4d_c, 8),
-  make_tuple(16, 32, highbd_sad_16x32x4d_c, 8),
-  make_tuple(16, 16, highbd_sad_16x16x4d_c, 8),
-  make_tuple(16, 8, highbd_sad_16x8x4d_c, 8),
-  make_tuple(8, 16, highbd_sad_8x16x4d_c, 8),
-  make_tuple(8, 8, highbd_sad_8x8x4d_c, 8),
-  make_tuple(8, 4, highbd_sad_8x4x4d_c, 8),
-  make_tuple(4, 8, highbd_sad_4x8x4d_c, 8),
-  make_tuple(4, 4, highbd_sad_4x4x4d_c, 8)));
-
-INSTANTIATE_TEST_CASE_P(C_10, SADx4Test, ::testing::Values(
-  make_tuple(64, 64, highbd_sad_64x64x4d_c, 10),
-  make_tuple(64, 32, highbd_sad_64x32x4d_c, 10),
-  make_tuple(32, 64, highbd_sad_32x64x4d_c, 10),
-  make_tuple(32, 32, highbd_sad_32x32x4d_c, 10),
-  make_tuple(32, 16, highbd_sad_32x16x4d_c, 10),
-  make_tuple(16, 32, highbd_sad_16x32x4d_c, 10),
-  make_tuple(16, 16, highbd_sad_16x16x4d_c, 10),
-  make_tuple(16, 8, highbd_sad_16x8x4d_c, 10),
-  make_tuple(8, 16, highbd_sad_8x16x4d_c, 10),
-  make_tuple(8, 8, highbd_sad_8x8x4d_c, 10),
-  make_tuple(8, 4, highbd_sad_8x4x4d_c, 10),
-  make_tuple(4, 8, highbd_sad_4x8x4d_c, 10),
-  make_tuple(4, 4, highbd_sad_4x4x4d_c, 10)));
-
-INSTANTIATE_TEST_CASE_P(C_12, SADx4Test, ::testing::Values(
-  make_tuple(64, 64, highbd_sad_64x64x4d_c, 12),
-  make_tuple(64, 32, highbd_sad_64x32x4d_c, 12),
-  make_tuple(32, 64, highbd_sad_32x64x4d_c, 12),
-  make_tuple(32, 32, highbd_sad_32x32x4d_c, 12),
-  make_tuple(32, 16, highbd_sad_32x16x4d_c, 12),
-  make_tuple(16, 32, highbd_sad_16x32x4d_c, 12),
-  make_tuple(16, 16, highbd_sad_16x16x4d_c, 12),
-  make_tuple(16, 8, highbd_sad_16x8x4d_c, 12),
-  make_tuple(8, 16, highbd_sad_8x16x4d_c, 12),
-  make_tuple(8, 8, highbd_sad_8x8x4d_c, 12),
-  make_tuple(8, 4, highbd_sad_8x4x4d_c, 12),
-  make_tuple(4, 8, highbd_sad_4x8x4d_c, 12),
-  make_tuple(4, 4, highbd_sad_4x4x4d_c, 12)));
+INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
+
+const SadMxNx4Func sad64x64x4d_c = vpx_sad64x64x4d_c;
+const SadMxNx4Func sad64x32x4d_c = vpx_sad64x32x4d_c;
+const SadMxNx4Func sad32x64x4d_c = vpx_sad32x64x4d_c;
+const SadMxNx4Func sad32x32x4d_c = vpx_sad32x32x4d_c;
+const SadMxNx4Func sad32x16x4d_c = vpx_sad32x16x4d_c;
+const SadMxNx4Func sad16x32x4d_c = vpx_sad16x32x4d_c;
+const SadMxNx4Func sad16x16x4d_c = vpx_sad16x16x4d_c;
+const SadMxNx4Func sad16x8x4d_c = vpx_sad16x8x4d_c;
+const SadMxNx4Func sad8x16x4d_c = vpx_sad8x16x4d_c;
+const SadMxNx4Func sad8x8x4d_c = vpx_sad8x8x4d_c;
+const SadMxNx4Func sad8x4x4d_c = vpx_sad8x4x4d_c;
+const SadMxNx4Func sad4x8x4d_c = vpx_sad4x8x4d_c;
+const SadMxNx4Func sad4x4x4d_c = vpx_sad4x4x4d_c;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNx4Func highbd_sad64x64x4d_c = vpx_highbd_sad64x64x4d_c;
+const SadMxNx4Func highbd_sad64x32x4d_c = vpx_highbd_sad64x32x4d_c;
+const SadMxNx4Func highbd_sad32x64x4d_c = vpx_highbd_sad32x64x4d_c;
+const SadMxNx4Func highbd_sad32x32x4d_c = vpx_highbd_sad32x32x4d_c;
+const SadMxNx4Func highbd_sad32x16x4d_c = vpx_highbd_sad32x16x4d_c;
+const SadMxNx4Func highbd_sad16x32x4d_c = vpx_highbd_sad16x32x4d_c;
+const SadMxNx4Func highbd_sad16x16x4d_c = vpx_highbd_sad16x16x4d_c;
+const SadMxNx4Func highbd_sad16x8x4d_c = vpx_highbd_sad16x8x4d_c;
+const SadMxNx4Func highbd_sad8x16x4d_c = vpx_highbd_sad8x16x4d_c;
+const SadMxNx4Func highbd_sad8x8x4d_c = vpx_highbd_sad8x8x4d_c;
+const SadMxNx4Func highbd_sad8x4x4d_c = vpx_highbd_sad8x4x4d_c;
+const SadMxNx4Func highbd_sad4x8x4d_c = vpx_highbd_sad4x8x4d_c;
+const SadMxNx4Func highbd_sad4x4x4d_c = vpx_highbd_sad4x4x4d_c;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+const SadMxNx4Param x4d_c_tests[] = {
+  make_tuple(64, 64, sad64x64x4d_c, -1),
+  make_tuple(64, 32, sad64x32x4d_c, -1),
+  make_tuple(32, 64, sad32x64x4d_c, -1),
+  make_tuple(32, 32, sad32x32x4d_c, -1),
+  make_tuple(32, 16, sad32x16x4d_c, -1),
+  make_tuple(16, 32, sad16x32x4d_c, -1),
+  make_tuple(16, 16, sad16x16x4d_c, -1),
+  make_tuple(16, 8, sad16x8x4d_c, -1),
+  make_tuple(8, 16, sad8x16x4d_c, -1),
+  make_tuple(8, 8, sad8x8x4d_c, -1),
+  make_tuple(8, 4, sad8x4x4d_c, -1),
+  make_tuple(4, 8, sad4x8x4d_c, -1),
+  make_tuple(4, 4, sad4x4x4d_c, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64x4d_c, 8),
+  make_tuple(64, 32, highbd_sad64x32x4d_c, 8),
+  make_tuple(32, 64, highbd_sad32x64x4d_c, 8),
+  make_tuple(32, 32, highbd_sad32x32x4d_c, 8),
+  make_tuple(32, 16, highbd_sad32x16x4d_c, 8),
+  make_tuple(16, 32, highbd_sad16x32x4d_c, 8),
+  make_tuple(16, 16, highbd_sad16x16x4d_c, 8),
+  make_tuple(16, 8, highbd_sad16x8x4d_c, 8),
+  make_tuple(8, 16, highbd_sad8x16x4d_c, 8),
+  make_tuple(8, 8, highbd_sad8x8x4d_c, 8),
+  make_tuple(8, 4, highbd_sad8x4x4d_c, 8),
+  make_tuple(4, 8, highbd_sad4x8x4d_c, 8),
+  make_tuple(4, 4, highbd_sad4x4x4d_c, 8),
+  make_tuple(64, 64, highbd_sad64x64x4d_c, 10),
+  make_tuple(64, 32, highbd_sad64x32x4d_c, 10),
+  make_tuple(32, 64, highbd_sad32x64x4d_c, 10),
+  make_tuple(32, 32, highbd_sad32x32x4d_c, 10),
+  make_tuple(32, 16, highbd_sad32x16x4d_c, 10),
+  make_tuple(16, 32, highbd_sad16x32x4d_c, 10),
+  make_tuple(16, 16, highbd_sad16x16x4d_c, 10),
+  make_tuple(16, 8, highbd_sad16x8x4d_c, 10),
+  make_tuple(8, 16, highbd_sad8x16x4d_c, 10),
+  make_tuple(8, 8, highbd_sad8x8x4d_c, 10),
+  make_tuple(8, 4, highbd_sad8x4x4d_c, 10),
+  make_tuple(4, 8, highbd_sad4x8x4d_c, 10),
+  make_tuple(4, 4, highbd_sad4x4x4d_c, 10),
+  make_tuple(64, 64, highbd_sad64x64x4d_c, 12),
+  make_tuple(64, 32, highbd_sad64x32x4d_c, 12),
+  make_tuple(32, 64, highbd_sad32x64x4d_c, 12),
+  make_tuple(32, 32, highbd_sad32x32x4d_c, 12),
+  make_tuple(32, 16, highbd_sad32x16x4d_c, 12),
+  make_tuple(16, 32, highbd_sad16x32x4d_c, 12),
+  make_tuple(16, 16, highbd_sad16x16x4d_c, 12),
+  make_tuple(16, 8, highbd_sad16x8x4d_c, 12),
+  make_tuple(8, 16, highbd_sad8x16x4d_c, 12),
+  make_tuple(8, 8, highbd_sad8x8x4d_c, 12),
+  make_tuple(8, 4, highbd_sad8x4x4d_c, 12),
+  make_tuple(4, 8, highbd_sad4x8x4d_c, 12),
+  make_tuple(4, 4, highbd_sad4x4x4d_c, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // CONFIG_VP9_ENCODER
+};
+INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 
 //------------------------------------------------------------------------------
 // ARM functions
 #if HAVE_MEDIA
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_armv6 = vp8_sad16x16_armv6;
-INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values(
-  make_tuple(16, 16, sad_16x16_armv6, -1)));
-#endif  // CONFIG_VP8_ENCODER
+const SadMxNFunc sad16x16_media = vpx_sad16x16_media;
+const SadMxNParam media_tests[] = {
+  make_tuple(16, 16, sad16x16_media, -1),
+};
+INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::ValuesIn(media_tests));
 #endif  // HAVE_MEDIA
 
 #if HAVE_NEON
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_neon = vp8_sad16x16_neon;
-const SadMxNFunc sad_8x16_neon = vp8_sad8x16_neon;
-const SadMxNFunc sad_16x8_neon = vp8_sad16x8_neon;
-const SadMxNFunc sad_8x8_neon = vp8_sad8x8_neon;
-const SadMxNFunc sad_4x4_neon = vp8_sad4x4_neon;
-INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values(
-  make_tuple(16, 16, sad_16x16_neon, -1),
-  make_tuple(8, 16, sad_8x16_neon, -1),
-  make_tuple(16, 8, sad_16x8_neon, -1),
-  make_tuple(8, 8, sad_8x8_neon, -1),
-  make_tuple(4, 4, sad_4x4_neon, -1)));
-#endif  // CONFIG_VP8_ENCODER
-#if CONFIG_VP9_ENCODER
-const SadMxNVp9Func sad_64x64_neon_vp9 = vp9_sad64x64_neon;
-const SadMxNVp9Func sad_32x32_neon_vp9 = vp9_sad32x32_neon;
-const SadMxNVp9Func sad_16x16_neon_vp9 = vp9_sad16x16_neon;
-const SadMxNVp9Func sad_8x8_neon_vp9 = vp9_sad8x8_neon;
-const SadMxNVp9Param neon_vp9_tests[] = {
-  make_tuple(64, 64, sad_64x64_neon_vp9, -1),
-  make_tuple(32, 32, sad_32x32_neon_vp9, -1),
-  make_tuple(16, 16, sad_16x16_neon_vp9, -1),
-  make_tuple(8, 8, sad_8x8_neon_vp9, -1),
+const SadMxNFunc sad64x64_neon = vpx_sad64x64_neon;
+const SadMxNFunc sad32x32_neon = vpx_sad32x32_neon;
+const SadMxNFunc sad16x16_neon = vpx_sad16x16_neon;
+const SadMxNFunc sad16x8_neon = vpx_sad16x8_neon;
+const SadMxNFunc sad8x16_neon = vpx_sad8x16_neon;
+const SadMxNFunc sad8x8_neon = vpx_sad8x8_neon;
+const SadMxNFunc sad4x4_neon = vpx_sad4x4_neon;
+
+const SadMxNParam neon_tests[] = {
+  make_tuple(64, 64, sad64x64_neon, -1),
+  make_tuple(32, 32, sad32x32_neon, -1),
+  make_tuple(16, 16, sad16x16_neon, -1),
+  make_tuple(16, 8, sad16x8_neon, -1),
+  make_tuple(8, 16, sad8x16_neon, -1),
+  make_tuple(8, 8, sad8x8_neon, -1),
+  make_tuple(4, 4, sad4x4_neon, -1),
+};
+INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
+
+const SadMxNx4Func sad64x64x4d_neon = vpx_sad64x64x4d_neon;
+const SadMxNx4Func sad32x32x4d_neon = vpx_sad32x32x4d_neon;
+const SadMxNx4Func sad16x16x4d_neon = vpx_sad16x16x4d_neon;
+const SadMxNx4Param x4d_neon_tests[] = {
+  make_tuple(64, 64, sad64x64x4d_neon, -1),
+  make_tuple(32, 32, sad32x32x4d_neon, -1),
+  make_tuple(16, 16, sad16x16x4d_neon, -1),
 };
-INSTANTIATE_TEST_CASE_P(NEON, SADVP9Test, ::testing::ValuesIn(neon_vp9_tests));
-#endif  // CONFIG_VP9_ENCODER
+INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
 #endif  // HAVE_NEON
 
 //------------------------------------------------------------------------------
 // x86 functions
 #if HAVE_MMX
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_mmx = vp8_sad16x16_mmx;
-const SadMxNFunc sad_8x16_mmx = vp8_sad8x16_mmx;
-const SadMxNFunc sad_16x8_mmx = vp8_sad16x8_mmx;
-const SadMxNFunc sad_8x8_mmx = vp8_sad8x8_mmx;
-const SadMxNFunc sad_4x4_mmx = vp8_sad4x4_mmx;
+const SadMxNFunc sad16x16_mmx = vpx_sad16x16_mmx;
+const SadMxNFunc sad16x8_mmx = vpx_sad16x8_mmx;
+const SadMxNFunc sad8x16_mmx = vpx_sad8x16_mmx;
+const SadMxNFunc sad8x8_mmx = vpx_sad8x8_mmx;
+const SadMxNFunc sad4x4_mmx = vpx_sad4x4_mmx;
 const SadMxNParam mmx_tests[] = {
-  make_tuple(16, 16, sad_16x16_mmx, -1),
-  make_tuple(8, 16, sad_8x16_mmx, -1),
-  make_tuple(16, 8, sad_16x8_mmx, -1),
-  make_tuple(8, 8, sad_8x8_mmx, -1),
-  make_tuple(4, 4, sad_4x4_mmx, -1),
+  make_tuple(16, 16, sad16x16_mmx, -1),
+  make_tuple(16, 8, sad16x8_mmx, -1),
+  make_tuple(8, 16, sad8x16_mmx, -1),
+  make_tuple(8, 8, sad8x8_mmx, -1),
+  make_tuple(4, 4, sad4x4_mmx, -1),
 };
 INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
-#endif  // CONFIG_VP8_ENCODER
-
 #endif  // HAVE_MMX
 
 #if HAVE_SSE
-#if CONFIG_VP9_ENCODER
 #if CONFIG_USE_X86INC
-const SadMxNVp9Func sad_4x4_sse_vp9 = vp9_sad4x4_sse;
-const SadMxNVp9Func sad_4x8_sse_vp9 = vp9_sad4x8_sse;
-INSTANTIATE_TEST_CASE_P(SSE, SADVP9Test, ::testing::Values(
-  make_tuple(4, 4, sad_4x4_sse_vp9, -1),
-  make_tuple(4, 8, sad_4x8_sse_vp9, -1)));
-
-const SadMxNx4Func sad_4x8x4d_sse = vp9_sad4x8x4d_sse;
-const SadMxNx4Func sad_4x4x4d_sse = vp9_sad4x4x4d_sse;
-INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
-  make_tuple(4, 8, sad_4x8x4d_sse, -1),
-  make_tuple(4, 4, sad_4x4x4d_sse, -1)));
+const SadMxNFunc sad4x8_sse = vpx_sad4x8_sse;
+const SadMxNFunc sad4x4_sse = vpx_sad4x4_sse;
+const SadMxNParam sse_tests[] = {
+  make_tuple(4, 8, sad4x8_sse, -1),
+  make_tuple(4, 4, sad4x4_sse, -1),
+};
+INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::ValuesIn(sse_tests));
+
+const SadMxNAvgFunc sad4x8_avg_sse = vpx_sad4x8_avg_sse;
+const SadMxNAvgFunc sad4x4_avg_sse = vpx_sad4x4_avg_sse;
+const SadMxNAvgParam avg_sse_tests[] = {
+  make_tuple(4, 8, sad4x8_avg_sse, -1),
+  make_tuple(4, 4, sad4x4_avg_sse, -1),
+};
+INSTANTIATE_TEST_CASE_P(SSE, SADavgTest, ::testing::ValuesIn(avg_sse_tests));
+
+const SadMxNx4Func sad4x8x4d_sse = vpx_sad4x8x4d_sse;
+const SadMxNx4Func sad4x4x4d_sse = vpx_sad4x4x4d_sse;
+const SadMxNx4Param x4d_sse_tests[] = {
+  make_tuple(4, 8, sad4x8x4d_sse, -1),
+  make_tuple(4, 4, sad4x4x4d_sse, -1),
+};
+INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::ValuesIn(x4d_sse_tests));
 #endif  // CONFIG_USE_X86INC
-#endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_SSE
 
 #if HAVE_SSE2
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_wmt = vp8_sad16x16_wmt;
-const SadMxNFunc sad_8x16_wmt = vp8_sad8x16_wmt;
-const SadMxNFunc sad_16x8_wmt = vp8_sad16x8_wmt;
-const SadMxNFunc sad_8x8_wmt = vp8_sad8x8_wmt;
-const SadMxNFunc sad_4x4_wmt = vp8_sad4x4_wmt;
+#if CONFIG_USE_X86INC
+const SadMxNFunc sad64x64_sse2 = vpx_sad64x64_sse2;
+const SadMxNFunc sad64x32_sse2 = vpx_sad64x32_sse2;
+const SadMxNFunc sad32x64_sse2 = vpx_sad32x64_sse2;
+const SadMxNFunc sad32x32_sse2 = vpx_sad32x32_sse2;
+const SadMxNFunc sad32x16_sse2 = vpx_sad32x16_sse2;
+const SadMxNFunc sad16x32_sse2 = vpx_sad16x32_sse2;
+const SadMxNFunc sad16x16_sse2 = vpx_sad16x16_sse2;
+const SadMxNFunc sad16x8_sse2 = vpx_sad16x8_sse2;
+const SadMxNFunc sad8x16_sse2 = vpx_sad8x16_sse2;
+const SadMxNFunc sad8x8_sse2 = vpx_sad8x8_sse2;
+const SadMxNFunc sad8x4_sse2 = vpx_sad8x4_sse2;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNFunc highbd_sad64x64_sse2 = vpx_highbd_sad64x64_sse2;
+const SadMxNFunc highbd_sad64x32_sse2 = vpx_highbd_sad64x32_sse2;
+const SadMxNFunc highbd_sad32x64_sse2 = vpx_highbd_sad32x64_sse2;
+const SadMxNFunc highbd_sad32x32_sse2 = vpx_highbd_sad32x32_sse2;
+const SadMxNFunc highbd_sad32x16_sse2 = vpx_highbd_sad32x16_sse2;
+const SadMxNFunc highbd_sad16x32_sse2 = vpx_highbd_sad16x32_sse2;
+const SadMxNFunc highbd_sad16x16_sse2 = vpx_highbd_sad16x16_sse2;
+const SadMxNFunc highbd_sad16x8_sse2 = vpx_highbd_sad16x8_sse2;
+const SadMxNFunc highbd_sad8x16_sse2 = vpx_highbd_sad8x16_sse2;
+const SadMxNFunc highbd_sad8x8_sse2 = vpx_highbd_sad8x8_sse2;
+const SadMxNFunc highbd_sad8x4_sse2 = vpx_highbd_sad8x4_sse2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 const SadMxNParam sse2_tests[] = {
-  make_tuple(16, 16, sad_16x16_wmt, -1),
-  make_tuple(8, 16, sad_8x16_wmt, -1),
-  make_tuple(16, 8, sad_16x8_wmt, -1),
-  make_tuple(8, 8, sad_8x8_wmt, -1),
-  make_tuple(4, 4, sad_4x4_wmt, -1),
+  make_tuple(64, 64, sad64x64_sse2, -1),
+  make_tuple(64, 32, sad64x32_sse2, -1),
+  make_tuple(32, 64, sad32x64_sse2, -1),
+  make_tuple(32, 32, sad32x32_sse2, -1),
+  make_tuple(32, 16, sad32x16_sse2, -1),
+  make_tuple(16, 32, sad16x32_sse2, -1),
+  make_tuple(16, 16, sad16x16_sse2, -1),
+  make_tuple(16, 8, sad16x8_sse2, -1),
+  make_tuple(8, 16, sad8x16_sse2, -1),
+  make_tuple(8, 8, sad8x8_sse2, -1),
+  make_tuple(8, 4, sad8x4_sse2, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64_sse2, 8),
+  make_tuple(64, 32, highbd_sad64x32_sse2, 8),
+  make_tuple(32, 64, highbd_sad32x64_sse2, 8),
+  make_tuple(32, 32, highbd_sad32x32_sse2, 8),
+  make_tuple(32, 16, highbd_sad32x16_sse2, 8),
+  make_tuple(16, 32, highbd_sad16x32_sse2, 8),
+  make_tuple(16, 16, highbd_sad16x16_sse2, 8),
+  make_tuple(16, 8, highbd_sad16x8_sse2, 8),
+  make_tuple(8, 16, highbd_sad8x16_sse2, 8),
+  make_tuple(8, 8, highbd_sad8x8_sse2, 8),
+  make_tuple(8, 4, highbd_sad8x4_sse2, 8),
+  make_tuple(64, 64, highbd_sad64x64_sse2, 10),
+  make_tuple(64, 32, highbd_sad64x32_sse2, 10),
+  make_tuple(32, 64, highbd_sad32x64_sse2, 10),
+  make_tuple(32, 32, highbd_sad32x32_sse2, 10),
+  make_tuple(32, 16, highbd_sad32x16_sse2, 10),
+  make_tuple(16, 32, highbd_sad16x32_sse2, 10),
+  make_tuple(16, 16, highbd_sad16x16_sse2, 10),
+  make_tuple(16, 8, highbd_sad16x8_sse2, 10),
+  make_tuple(8, 16, highbd_sad8x16_sse2, 10),
+  make_tuple(8, 8, highbd_sad8x8_sse2, 10),
+  make_tuple(8, 4, highbd_sad8x4_sse2, 10),
+  make_tuple(64, 64, highbd_sad64x64_sse2, 12),
+  make_tuple(64, 32, highbd_sad64x32_sse2, 12),
+  make_tuple(32, 64, highbd_sad32x64_sse2, 12),
+  make_tuple(32, 32, highbd_sad32x32_sse2, 12),
+  make_tuple(32, 16, highbd_sad32x16_sse2, 12),
+  make_tuple(16, 32, highbd_sad16x32_sse2, 12),
+  make_tuple(16, 16, highbd_sad16x16_sse2, 12),
+  make_tuple(16, 8, highbd_sad16x8_sse2, 12),
+  make_tuple(8, 16, highbd_sad8x16_sse2, 12),
+  make_tuple(8, 8, highbd_sad8x8_sse2, 12),
+  make_tuple(8, 4, highbd_sad8x4_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-#if CONFIG_USE_X86INC
-const SadMxNVp9Func sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
-const SadMxNVp9Func sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
-const SadMxNVp9Func sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
-const SadMxNVp9Func sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
-const SadMxNVp9Func sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
-const SadMxNVp9Func sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
-const SadMxNVp9Func sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
-const SadMxNVp9Func sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
-const SadMxNVp9Func sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
-const SadMxNVp9Func sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
-const SadMxNVp9Func sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
-
-const SadMxNx4Func sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
-const SadMxNx4Func sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
-const SadMxNx4Func sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2;
-const SadMxNx4Func sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2;
-const SadMxNx4Func sad_32x16x4d_sse2 = vp9_sad32x16x4d_sse2;
-const SadMxNx4Func sad_16x32x4d_sse2 = vp9_sad16x32x4d_sse2;
-const SadMxNx4Func sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2;
-const SadMxNx4Func sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2;
-const SadMxNx4Func sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2;
-const SadMxNx4Func sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2;
-const SadMxNx4Func sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2;
+const SadMxNAvgFunc sad64x64_avg_sse2 = vpx_sad64x64_avg_sse2;
+const SadMxNAvgFunc sad64x32_avg_sse2 = vpx_sad64x32_avg_sse2;
+const SadMxNAvgFunc sad32x64_avg_sse2 = vpx_sad32x64_avg_sse2;
+const SadMxNAvgFunc sad32x32_avg_sse2 = vpx_sad32x32_avg_sse2;
+const SadMxNAvgFunc sad32x16_avg_sse2 = vpx_sad32x16_avg_sse2;
+const SadMxNAvgFunc sad16x32_avg_sse2 = vpx_sad16x32_avg_sse2;
+const SadMxNAvgFunc sad16x16_avg_sse2 = vpx_sad16x16_avg_sse2;
+const SadMxNAvgFunc sad16x8_avg_sse2 = vpx_sad16x8_avg_sse2;
+const SadMxNAvgFunc sad8x16_avg_sse2 = vpx_sad8x16_avg_sse2;
+const SadMxNAvgFunc sad8x8_avg_sse2 = vpx_sad8x8_avg_sse2;
+const SadMxNAvgFunc sad8x4_avg_sse2 = vpx_sad8x4_avg_sse2;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNAvgFunc highbd_sad64x64_avg_sse2 = vpx_highbd_sad64x64_avg_sse2;
+const SadMxNAvgFunc highbd_sad64x32_avg_sse2 = vpx_highbd_sad64x32_avg_sse2;
+const SadMxNAvgFunc highbd_sad32x64_avg_sse2 = vpx_highbd_sad32x64_avg_sse2;
+const SadMxNAvgFunc highbd_sad32x32_avg_sse2 = vpx_highbd_sad32x32_avg_sse2;
+const SadMxNAvgFunc highbd_sad32x16_avg_sse2 = vpx_highbd_sad32x16_avg_sse2;
+const SadMxNAvgFunc highbd_sad16x32_avg_sse2 = vpx_highbd_sad16x32_avg_sse2;
+const SadMxNAvgFunc highbd_sad16x16_avg_sse2 = vpx_highbd_sad16x16_avg_sse2;
+const SadMxNAvgFunc highbd_sad16x8_avg_sse2 = vpx_highbd_sad16x8_avg_sse2;
+const SadMxNAvgFunc highbd_sad8x16_avg_sse2 = vpx_highbd_sad8x16_avg_sse2;
+const SadMxNAvgFunc highbd_sad8x8_avg_sse2 = vpx_highbd_sad8x8_avg_sse2;
+const SadMxNAvgFunc highbd_sad8x4_avg_sse2 = vpx_highbd_sad8x4_avg_sse2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+const SadMxNAvgParam avg_sse2_tests[] = {
+  make_tuple(64, 64, sad64x64_avg_sse2, -1),
+  make_tuple(64, 32, sad64x32_avg_sse2, -1),
+  make_tuple(32, 64, sad32x64_avg_sse2, -1),
+  make_tuple(32, 32, sad32x32_avg_sse2, -1),
+  make_tuple(32, 16, sad32x16_avg_sse2, -1),
+  make_tuple(16, 32, sad16x32_avg_sse2, -1),
+  make_tuple(16, 16, sad16x16_avg_sse2, -1),
+  make_tuple(16, 8, sad16x8_avg_sse2, -1),
+  make_tuple(8, 16, sad8x16_avg_sse2, -1),
+  make_tuple(8, 8, sad8x8_avg_sse2, -1),
+  make_tuple(8, 4, sad8x4_avg_sse2, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64_avg_sse2, 8),
+  make_tuple(64, 32, highbd_sad64x32_avg_sse2, 8),
+  make_tuple(32, 64, highbd_sad32x64_avg_sse2, 8),
+  make_tuple(32, 32, highbd_sad32x32_avg_sse2, 8),
+  make_tuple(32, 16, highbd_sad32x16_avg_sse2, 8),
+  make_tuple(16, 32, highbd_sad16x32_avg_sse2, 8),
+  make_tuple(16, 16, highbd_sad16x16_avg_sse2, 8),
+  make_tuple(16, 8, highbd_sad16x8_avg_sse2, 8),
+  make_tuple(8, 16, highbd_sad8x16_avg_sse2, 8),
+  make_tuple(8, 8, highbd_sad8x8_avg_sse2, 8),
+  make_tuple(8, 4, highbd_sad8x4_avg_sse2, 8),
+  make_tuple(64, 64, highbd_sad64x64_avg_sse2, 10),
+  make_tuple(64, 32, highbd_sad64x32_avg_sse2, 10),
+  make_tuple(32, 64, highbd_sad32x64_avg_sse2, 10),
+  make_tuple(32, 32, highbd_sad32x32_avg_sse2, 10),
+  make_tuple(32, 16, highbd_sad32x16_avg_sse2, 10),
+  make_tuple(16, 32, highbd_sad16x32_avg_sse2, 10),
+  make_tuple(16, 16, highbd_sad16x16_avg_sse2, 10),
+  make_tuple(16, 8, highbd_sad16x8_avg_sse2, 10),
+  make_tuple(8, 16, highbd_sad8x16_avg_sse2, 10),
+  make_tuple(8, 8, highbd_sad8x8_avg_sse2, 10),
+  make_tuple(8, 4, highbd_sad8x4_avg_sse2, 10),
+  make_tuple(64, 64, highbd_sad64x64_avg_sse2, 12),
+  make_tuple(64, 32, highbd_sad64x32_avg_sse2, 12),
+  make_tuple(32, 64, highbd_sad32x64_avg_sse2, 12),
+  make_tuple(32, 32, highbd_sad32x32_avg_sse2, 12),
+  make_tuple(32, 16, highbd_sad32x16_avg_sse2, 12),
+  make_tuple(16, 32, highbd_sad16x32_avg_sse2, 12),
+  make_tuple(16, 16, highbd_sad16x16_avg_sse2, 12),
+  make_tuple(16, 8, highbd_sad16x8_avg_sse2, 12),
+  make_tuple(8, 16, highbd_sad8x16_avg_sse2, 12),
+  make_tuple(8, 8, highbd_sad8x8_avg_sse2, 12),
+  make_tuple(8, 4, highbd_sad8x4_avg_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
+
+const SadMxNx4Func sad64x64x4d_sse2 = vpx_sad64x64x4d_sse2;
+const SadMxNx4Func sad64x32x4d_sse2 = vpx_sad64x32x4d_sse2;
+const SadMxNx4Func sad32x64x4d_sse2 = vpx_sad32x64x4d_sse2;
+const SadMxNx4Func sad32x32x4d_sse2 = vpx_sad32x32x4d_sse2;
+const SadMxNx4Func sad32x16x4d_sse2 = vpx_sad32x16x4d_sse2;
+const SadMxNx4Func sad16x32x4d_sse2 = vpx_sad16x32x4d_sse2;
+const SadMxNx4Func sad16x16x4d_sse2 = vpx_sad16x16x4d_sse2;
+const SadMxNx4Func sad16x8x4d_sse2 = vpx_sad16x8x4d_sse2;
+const SadMxNx4Func sad8x16x4d_sse2 = vpx_sad8x16x4d_sse2;
+const SadMxNx4Func sad8x8x4d_sse2 = vpx_sad8x8x4d_sse2;
+const SadMxNx4Func sad8x4x4d_sse2 = vpx_sad8x4x4d_sse2;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNx4Func highbd_sad64x64x4d_sse2 = vpx_highbd_sad64x64x4d_sse2;
+const SadMxNx4Func highbd_sad64x32x4d_sse2 = vpx_highbd_sad64x32x4d_sse2;
+const SadMxNx4Func highbd_sad32x64x4d_sse2 = vpx_highbd_sad32x64x4d_sse2;
+const SadMxNx4Func highbd_sad32x32x4d_sse2 = vpx_highbd_sad32x32x4d_sse2;
+const SadMxNx4Func highbd_sad32x16x4d_sse2 = vpx_highbd_sad32x16x4d_sse2;
+const SadMxNx4Func highbd_sad16x32x4d_sse2 = vpx_highbd_sad16x32x4d_sse2;
+const SadMxNx4Func highbd_sad16x16x4d_sse2 = vpx_highbd_sad16x16x4d_sse2;
+const SadMxNx4Func highbd_sad16x8x4d_sse2 = vpx_highbd_sad16x8x4d_sse2;
+const SadMxNx4Func highbd_sad8x16x4d_sse2 = vpx_highbd_sad8x16x4d_sse2;
+const SadMxNx4Func highbd_sad8x8x4d_sse2 = vpx_highbd_sad8x8x4d_sse2;
+const SadMxNx4Func highbd_sad8x4x4d_sse2 = vpx_highbd_sad8x4x4d_sse2;
+const SadMxNx4Func highbd_sad4x8x4d_sse2 = vpx_highbd_sad4x8x4d_sse2;
+const SadMxNx4Func highbd_sad4x4x4d_sse2 = vpx_highbd_sad4x4x4d_sse2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+const SadMxNx4Param x4d_sse2_tests[] = {
+  make_tuple(64, 64, sad64x64x4d_sse2, -1),
+  make_tuple(64, 32, sad64x32x4d_sse2, -1),
+  make_tuple(32, 64, sad32x64x4d_sse2, -1),
+  make_tuple(32, 32, sad32x32x4d_sse2, -1),
+  make_tuple(32, 16, sad32x16x4d_sse2, -1),
+  make_tuple(16, 32, sad16x32x4d_sse2, -1),
+  make_tuple(16, 16, sad16x16x4d_sse2, -1),
+  make_tuple(16, 8, sad16x8x4d_sse2, -1),
+  make_tuple(8, 16, sad8x16x4d_sse2, -1),
+  make_tuple(8, 8, sad8x8x4d_sse2, -1),
+  make_tuple(8, 4, sad8x4x4d_sse2, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
-const SadMxNVp9Func highbd_sad8x4_sse2_vp9 = vp9_highbd_sad8x4_sse2;
-const SadMxNVp9Func highbd_sad8x8_sse2_vp9 = vp9_highbd_sad8x8_sse2;
-const SadMxNVp9Func highbd_sad8x16_sse2_vp9 = vp9_highbd_sad8x16_sse2;
-const SadMxNVp9Func highbd_sad16x8_sse2_vp9 = vp9_highbd_sad16x8_sse2;
-const SadMxNVp9Func highbd_sad16x16_sse2_vp9 = vp9_highbd_sad16x16_sse2;
-const SadMxNVp9Func highbd_sad16x32_sse2_vp9 = vp9_highbd_sad16x32_sse2;
-const SadMxNVp9Func highbd_sad32x16_sse2_vp9 = vp9_highbd_sad32x16_sse2;
-const SadMxNVp9Func highbd_sad32x32_sse2_vp9 = vp9_highbd_sad32x32_sse2;
-const SadMxNVp9Func highbd_sad32x64_sse2_vp9 = vp9_highbd_sad32x64_sse2;
-const SadMxNVp9Func highbd_sad64x32_sse2_vp9 = vp9_highbd_sad64x32_sse2;
-const SadMxNVp9Func highbd_sad64x64_sse2_vp9 = vp9_highbd_sad64x64_sse2;
-
-INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::Values(
-  make_tuple(64, 64, sad_64x64_sse2_vp9, -1),
-  make_tuple(64, 32, sad_64x32_sse2_vp9, -1),
-  make_tuple(32, 64, sad_32x64_sse2_vp9, -1),
-  make_tuple(32, 32, sad_32x32_sse2_vp9, -1),
-  make_tuple(32, 16, sad_32x16_sse2_vp9, -1),
-  make_tuple(16, 32, sad_16x32_sse2_vp9, -1),
-  make_tuple(16, 16, sad_16x16_sse2_vp9, -1),
-  make_tuple(16, 8, sad_16x8_sse2_vp9, -1),
-  make_tuple(8, 16, sad_8x16_sse2_vp9, -1),
-  make_tuple(8, 8, sad_8x8_sse2_vp9, -1),
-  make_tuple(8, 4, sad_8x4_sse2_vp9, -1),
-  make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 8),
-  make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 8),
-  make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 8),
-  make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 8),
highbd_sad16x8_sse2_vp9, 8), - make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 8), - make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 8), - make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 8), - make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 8), - make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 8), - make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 8), - make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 8), - make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 10), - make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 10), - make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 10), - make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 10), - make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 10), - make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 10), - make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 10), - make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 10), - make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 10), - make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 10), - make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 10), - make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 12), - make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 12), - make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 12), - make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 12), - make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 12), - make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 12), - make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 12), - make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 12), - make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 12), - make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 12), - make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 12))); - -const SadMxNAvgVp9Func highbd_sad8x4_avg_sse2_vp9 = vp9_highbd_sad8x4_avg_sse2; -const SadMxNAvgVp9Func highbd_sad8x8_avg_sse2_vp9 = vp9_highbd_sad8x8_avg_sse2; -const SadMxNAvgVp9Func highbd_sad8x16_avg_sse2_vp9 = - vp9_highbd_sad8x16_avg_sse2; -const SadMxNAvgVp9Func highbd_sad16x8_avg_sse2_vp9 = - vp9_highbd_sad16x8_avg_sse2; -const SadMxNAvgVp9Func highbd_sad16x16_avg_sse2_vp9 = - vp9_highbd_sad16x16_avg_sse2; -const SadMxNAvgVp9Func highbd_sad16x32_avg_sse2_vp9 = - vp9_highbd_sad16x32_avg_sse2; -const SadMxNAvgVp9Func highbd_sad32x16_avg_sse2_vp9 = - vp9_highbd_sad32x16_avg_sse2; -const SadMxNAvgVp9Func highbd_sad32x32_avg_sse2_vp9 = - vp9_highbd_sad32x32_avg_sse2; -const SadMxNAvgVp9Func highbd_sad32x64_avg_sse2_vp9 = - vp9_highbd_sad32x64_avg_sse2; -const SadMxNAvgVp9Func highbd_sad64x32_avg_sse2_vp9 = - vp9_highbd_sad64x32_avg_sse2; -const SadMxNAvgVp9Func highbd_sad64x64_avg_sse2_vp9 = - vp9_highbd_sad64x64_avg_sse2; - -INSTANTIATE_TEST_CASE_P(SSE2, SADavgVP9Test, ::testing::Values( - make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 8), - make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 8), - make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 8), - make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 8), - make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 8), - make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 8), - make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 8), - make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 8), - make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 8), - make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 8), - make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 8), - make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 10), - make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 10), - make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 10), - make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 10), - make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 10), - make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 10), - make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 10), - make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 10), - 
make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 10), - make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 10), - make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 10), - make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 12), - make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 12), - make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 12), - make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 12), - make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 12), - make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 12), - make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 12), - make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 12), - make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 12), - make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 12), - make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 12))); - -const SadMxNx4Func highbd_sad_64x64x4d_sse2 = vp9_highbd_sad64x64x4d_sse2; -const SadMxNx4Func highbd_sad_64x32x4d_sse2 = vp9_highbd_sad64x32x4d_sse2; -const SadMxNx4Func highbd_sad_32x64x4d_sse2 = vp9_highbd_sad32x64x4d_sse2; -const SadMxNx4Func highbd_sad_32x32x4d_sse2 = vp9_highbd_sad32x32x4d_sse2; -const SadMxNx4Func highbd_sad_32x16x4d_sse2 = vp9_highbd_sad32x16x4d_sse2; -const SadMxNx4Func highbd_sad_16x32x4d_sse2 = vp9_highbd_sad16x32x4d_sse2; -const SadMxNx4Func highbd_sad_16x16x4d_sse2 = vp9_highbd_sad16x16x4d_sse2; -const SadMxNx4Func highbd_sad_16x8x4d_sse2 = vp9_highbd_sad16x8x4d_sse2; -const SadMxNx4Func highbd_sad_8x16x4d_sse2 = vp9_highbd_sad8x16x4d_sse2; -const SadMxNx4Func highbd_sad_8x8x4d_sse2 = vp9_highbd_sad8x8x4d_sse2; -const SadMxNx4Func highbd_sad_8x4x4d_sse2 = vp9_highbd_sad8x4x4d_sse2; -const SadMxNx4Func highbd_sad_4x8x4d_sse2 = vp9_highbd_sad4x8x4d_sse2; -const SadMxNx4Func highbd_sad_4x4x4d_sse2 = vp9_highbd_sad4x4x4d_sse2; - -INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( - make_tuple(64, 64, sad_64x64x4d_sse2, -1), - make_tuple(64, 32, sad_64x32x4d_sse2, -1), - make_tuple(32, 64, sad_32x64x4d_sse2, -1), - make_tuple(32, 32, sad_32x32x4d_sse2, -1), - make_tuple(32, 16, sad_32x16x4d_sse2, -1), - make_tuple(16, 32, sad_16x32x4d_sse2, -1), - make_tuple(16, 16, sad_16x16x4d_sse2, -1), - make_tuple(16, 8, sad_16x8x4d_sse2, -1), - make_tuple(8, 16, sad_8x16x4d_sse2, -1), - make_tuple(8, 8, sad_8x8x4d_sse2, -1), - make_tuple(8, 4, sad_8x4x4d_sse2, -1), - make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 8), - make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 8), - make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 8), - make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 8), - make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 8), - make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 8), - make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 8), - make_tuple(16, 8, highbd_sad_16x8x4d_sse2, 8), - make_tuple(8, 16, highbd_sad_8x16x4d_sse2, 8), - make_tuple(8, 8, highbd_sad_8x8x4d_sse2, 8), - make_tuple(8, 4, highbd_sad_8x4x4d_sse2, 8), - make_tuple(4, 8, highbd_sad_4x8x4d_sse2, 8), - make_tuple(4, 4, highbd_sad_4x4x4d_sse2, 8), - make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 10), - make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 10), - make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 10), - make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 10), - make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 10), - make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 10), - make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 10), - make_tuple(16, 8, highbd_sad_16x8x4d_sse2, 10), - make_tuple(8, 16, highbd_sad_8x16x4d_sse2, 10), - make_tuple(8, 8, highbd_sad_8x8x4d_sse2, 10), - make_tuple(8, 4, highbd_sad_8x4x4d_sse2, 10), - make_tuple(4, 8, highbd_sad_4x8x4d_sse2, 10), - make_tuple(4, 
4, highbd_sad_4x4x4d_sse2, 10), - make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 12), - make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 12), - make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 12), - make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 12), - make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 12), - make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 12), - make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 12), - make_tuple(16, 8, highbd_sad_16x8x4d_sse2, 12), - make_tuple(8, 16, highbd_sad_8x16x4d_sse2, 12), - make_tuple(8, 8, highbd_sad_8x8x4d_sse2, 12), - make_tuple(8, 4, highbd_sad_8x4x4d_sse2, 12), - make_tuple(4, 8, highbd_sad_4x8x4d_sse2, 12), - make_tuple(4, 4, highbd_sad_4x4x4d_sse2, 12))); -#else -INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::Values( - make_tuple(64, 64, sad_64x64_sse2_vp9, -1), - make_tuple(64, 32, sad_64x32_sse2_vp9, -1), - make_tuple(32, 64, sad_32x64_sse2_vp9, -1), - make_tuple(32, 32, sad_32x32_sse2_vp9, -1), - make_tuple(32, 16, sad_32x16_sse2_vp9, -1), - make_tuple(16, 32, sad_16x32_sse2_vp9, -1), - make_tuple(16, 16, sad_16x16_sse2_vp9, -1), - make_tuple(16, 8, sad_16x8_sse2_vp9, -1), - make_tuple(8, 16, sad_8x16_sse2_vp9, -1), - make_tuple(8, 8, sad_8x8_sse2_vp9, -1), - make_tuple(8, 4, sad_8x4_sse2_vp9, -1))); - -INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( - make_tuple(64, 64, sad_64x64x4d_sse2, -1), - make_tuple(64, 32, sad_64x32x4d_sse2, -1), - make_tuple(32, 64, sad_32x64x4d_sse2, -1), - make_tuple(32, 32, sad_32x32x4d_sse2, -1), - make_tuple(32, 16, sad_32x16x4d_sse2, -1), - make_tuple(16, 32, sad_16x32x4d_sse2, -1), - make_tuple(16, 16, sad_16x16x4d_sse2, -1), - make_tuple(16, 8, sad_16x8x4d_sse2, -1), - make_tuple(8, 16, sad_8x16x4d_sse2, -1), - make_tuple(8, 8, sad_8x8x4d_sse2, -1), - make_tuple(8, 4, sad_8x4x4d_sse2, -1))); + make_tuple(64, 64, highbd_sad64x64x4d_sse2, 8), + make_tuple(64, 32, highbd_sad64x32x4d_sse2, 8), + make_tuple(32, 64, highbd_sad32x64x4d_sse2, 8), + make_tuple(32, 32, highbd_sad32x32x4d_sse2, 8), + make_tuple(32, 16, highbd_sad32x16x4d_sse2, 8), + make_tuple(16, 32, highbd_sad16x32x4d_sse2, 8), + make_tuple(16, 16, highbd_sad16x16x4d_sse2, 8), + make_tuple(16, 8, highbd_sad16x8x4d_sse2, 8), + make_tuple(8, 16, highbd_sad8x16x4d_sse2, 8), + make_tuple(8, 8, highbd_sad8x8x4d_sse2, 8), + make_tuple(8, 4, highbd_sad8x4x4d_sse2, 8), + make_tuple(4, 8, highbd_sad4x8x4d_sse2, 8), + make_tuple(4, 4, highbd_sad4x4x4d_sse2, 8), + make_tuple(64, 64, highbd_sad64x64x4d_sse2, 10), + make_tuple(64, 32, highbd_sad64x32x4d_sse2, 10), + make_tuple(32, 64, highbd_sad32x64x4d_sse2, 10), + make_tuple(32, 32, highbd_sad32x32x4d_sse2, 10), + make_tuple(32, 16, highbd_sad32x16x4d_sse2, 10), + make_tuple(16, 32, highbd_sad16x32x4d_sse2, 10), + make_tuple(16, 16, highbd_sad16x16x4d_sse2, 10), + make_tuple(16, 8, highbd_sad16x8x4d_sse2, 10), + make_tuple(8, 16, highbd_sad8x16x4d_sse2, 10), + make_tuple(8, 8, highbd_sad8x8x4d_sse2, 10), + make_tuple(8, 4, highbd_sad8x4x4d_sse2, 10), + make_tuple(4, 8, highbd_sad4x8x4d_sse2, 10), + make_tuple(4, 4, highbd_sad4x4x4d_sse2, 10), + make_tuple(64, 64, highbd_sad64x64x4d_sse2, 12), + make_tuple(64, 32, highbd_sad64x32x4d_sse2, 12), + make_tuple(32, 64, highbd_sad32x64x4d_sse2, 12), + make_tuple(32, 32, highbd_sad32x32x4d_sse2, 12), + make_tuple(32, 16, highbd_sad32x16x4d_sse2, 12), + make_tuple(16, 32, highbd_sad16x32x4d_sse2, 12), + make_tuple(16, 16, highbd_sad16x16x4d_sse2, 12), + make_tuple(16, 8, highbd_sad16x8x4d_sse2, 12), + make_tuple(8, 16, highbd_sad8x16x4d_sse2, 12), + make_tuple(8, 8, 
highbd_sad8x8x4d_sse2, 12),
+ make_tuple(8, 4, highbd_sad8x4x4d_sse2, 12),
+ make_tuple(4, 8, highbd_sad4x8x4d_sse2, 12),
+ make_tuple(4, 4, highbd_sad4x4x4d_sse2, 12),
#endif // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
#endif // CONFIG_USE_X86INC
-#endif // CONFIG_VP9_ENCODER
#endif // HAVE_SSE2

#if HAVE_SSE3
-#if CONFIG_VP8_ENCODER
-const SadMxNx4Func sad_16x16x4d_sse3 = vp8_sad16x16x4d_sse3;
-const SadMxNx4Func sad_16x8x4d_sse3 = vp8_sad16x8x4d_sse3;
-const SadMxNx4Func sad_8x16x4d_sse3 = vp8_sad8x16x4d_sse3;
-const SadMxNx4Func sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3;
-const SadMxNx4Func sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3;
-INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values(
- make_tuple(16, 16, sad_16x16x4d_sse3, -1),
- make_tuple(16, 8, sad_16x8x4d_sse3, -1),
- make_tuple(8, 16, sad_8x16x4d_sse3, -1),
- make_tuple(8, 8, sad_8x8x4d_sse3, -1),
- make_tuple(4, 4, sad_4x4x4d_sse3, -1)));
-#endif // CONFIG_VP8_ENCODER
+// The only SSE3 functions are the x3 variants, which do not have tests.
#endif // HAVE_SSE3

#if HAVE_SSSE3
-#if CONFIG_USE_X86INC
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_sse3 = vp8_sad16x16_sse3;
-INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
- make_tuple(16, 16, sad_16x16_sse3, -1)));
-#endif // CONFIG_VP8_ENCODER
-#endif // CONFIG_USE_X86INC
+// The only SSSE3 functions are the x3 variants, which do not have tests.
#endif // HAVE_SSSE3

-#if CONFIG_VP9_ENCODER
+#if HAVE_SSE4_1
+// The only SSE4.1 functions are the x8 variants, which do not have tests.
+#endif // HAVE_SSE4_1
+
#if HAVE_AVX2
-const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
-const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
-INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
- make_tuple(32, 32, sad_32x32x4d_avx2, -1),
- make_tuple(64, 64, sad_64x64x4d_avx2, -1)));
-#endif // HAVE_AVX2
+const SadMxNFunc sad64x64_avx2 = vpx_sad64x64_avx2;
+const SadMxNFunc sad64x32_avx2 = vpx_sad64x32_avx2;
+const SadMxNFunc sad32x64_avx2 = vpx_sad32x64_avx2;
+const SadMxNFunc sad32x32_avx2 = vpx_sad32x32_avx2;
+const SadMxNFunc sad32x16_avx2 = vpx_sad32x16_avx2;
+const SadMxNParam avx2_tests[] = {
+ make_tuple(64, 64, sad64x64_avx2, -1),
+ make_tuple(64, 32, sad64x32_avx2, -1),
+ make_tuple(32, 64, sad32x64_avx2, -1),
+ make_tuple(32, 32, sad32x32_avx2, -1),
+ make_tuple(32, 16, sad32x16_avx2, -1),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
+
+const SadMxNAvgFunc sad64x64_avg_avx2 = vpx_sad64x64_avg_avx2;
+const SadMxNAvgFunc sad64x32_avg_avx2 = vpx_sad64x32_avg_avx2;
+const SadMxNAvgFunc sad32x64_avg_avx2 = vpx_sad32x64_avg_avx2;
+const SadMxNAvgFunc sad32x32_avg_avx2 = vpx_sad32x32_avg_avx2;
+const SadMxNAvgFunc sad32x16_avg_avx2 = vpx_sad32x16_avg_avx2;
+const SadMxNAvgParam avg_avx2_tests[] = {
+ make_tuple(64, 64, sad64x64_avg_avx2, -1),
+ make_tuple(64, 32, sad64x32_avg_avx2, -1),
+ make_tuple(32, 64, sad32x64_avg_avx2, -1),
+ make_tuple(32, 32, sad32x32_avg_avx2, -1),
+ make_tuple(32, 16, sad32x16_avg_avx2, -1),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));

-#if HAVE_NEON
-const SadMxNx4Func sad_16x16x4d_neon = vp9_sad16x16x4d_neon;
-const SadMxNx4Func sad_32x32x4d_neon = vp9_sad32x32x4d_neon;
-const SadMxNx4Func sad_64x64x4d_neon = vp9_sad64x64x4d_neon;
-INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::Values(
- make_tuple(16, 16, sad_16x16x4d_neon, -1),
- make_tuple(32, 32, sad_32x32x4d_neon, -1),
- make_tuple(64, 64, sad_64x64x4d_neon, -1)));
-#endif // HAVE_NEON
-#endif //
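The SADx4Test tables above (and the AVX2 one just below) all drive x4d kernels, which compute four SADs in a single call. A sketch of that contract, following the vp8_sadNxMx4d_c wrappers deleted later in this patch (sad_x4d_ref_c is a hypothetical name):

#include <stdlib.h>
#include "vpx/vpx_integer.h"

/* One source block is compared against four independent reference blocks;
 * the four results are written to sad_array. */
static void sad_x4d_ref_c(const uint8_t *src, int src_stride,
                          const uint8_t *const refs[4], int ref_stride,
                          int width, int height, uint32_t sad_array[4]) {
  int i, r, c;
  for (i = 0; i < 4; ++i) {
    const uint8_t *s = src;
    const uint8_t *p = refs[i];
    uint32_t sad = 0;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c) sad += abs(s[c] - p[c]);
      s += src_stride;
      p += ref_stride;
    }
    sad_array[i] = sad;
  }
}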
CONFIG_VP9_ENCODER +const SadMxNx4Func sad64x64x4d_avx2 = vpx_sad64x64x4d_avx2; +const SadMxNx4Func sad32x32x4d_avx2 = vpx_sad32x32x4d_avx2; +const SadMxNx4Param x4d_avx2_tests[] = { + make_tuple(64, 64, sad64x64x4d_avx2, -1), + make_tuple(32, 32, sad32x32x4d_avx2, -1), +}; +INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +#endif // HAVE_AVX2 } // namespace diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc index 30a5255b2..edbeec294 100644 --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@ -19,6 +19,7 @@ extern void vp8_rtcd(); #if CONFIG_VP9 extern void vp9_rtcd(); #endif // CONFIG_VP9 +extern void vpx_dsp_rtcd(); extern void vpx_scale_rtcd(); } #include "third_party/googletest/src/include/gtest/gtest.h" @@ -64,6 +65,7 @@ int main(int argc, char **argv) { #if CONFIG_VP9 vp9_rtcd(); #endif // CONFIG_VP9 + vpx_dsp_rtcd(); vpx_scale_rtcd(); #endif // !CONFIG_SHARED diff --git a/vp8/common/arm/neon/sad_neon.c b/vp8/common/arm/neon/sad_neon.c deleted file mode 100644 index 6595ac051..000000000 --- a/vp8/common/arm/neon/sad_neon.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -unsigned int vp8_sad8x8_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 7; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } - - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); -} - -unsigned int vp8_sad8x16_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } - - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); -} - -unsigned int vp8_sad4x4_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x2_t d1; - uint64x1_t d3; - int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 3; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } - - d1 = vpaddl_u16(vget_low_u16(q12)); - d3 = vpaddl_u32(d1); - - return vget_lane_u32(vreinterpret_u32_u64(d3), 
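Every kernel in the deleted sad_neon.c follows the same idiom: vabdl_u8 starts a widening (u8 to u16) absolute-difference accumulator, vabal_u8 folds in the remaining rows, and a vpaddl pairwise-add chain reduces the vector to a scalar. The same idiom expressed with NEON intrinsics, as a sketch (sad8x8_neon_sketch is a hypothetical name):

#include <arm_neon.h>
#include <stdint.h>

static unsigned int sad8x8_neon_sketch(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride) {
  /* First row starts the widening accumulator... */
  uint16x8_t acc = vabdl_u8(vld1_u8(src), vld1_u8(ref));
  int i;
  for (i = 1; i < 8; ++i) {
    src += src_stride;
    ref += ref_stride;
    acc = vabal_u8(acc, vld1_u8(src), vld1_u8(ref));  /* ...rows 2-8 accumulate. */
  }
  /* Pairwise widening adds: u16x8 -> u32x4 -> u64x2, then sum the two lanes. */
  {
    const uint64x2_t total = vpaddlq_u32(vpaddlq_u16(acc));
    return (unsigned int)(vgetq_lane_u64(total, 0) + vgetq_lane_u64(total, 1));
  }
}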
0); -} - -unsigned int vp8_sad16x16_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; - - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); - - for (i = 0; i < 15; i++) { - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); - } - - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); -} - -unsigned int vp8_sad16x8_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; - - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); - - for (i = 0; i < 7; i++) { - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); - } - - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); -} diff --git a/vp8/common/copy_c.c b/vp8/common/copy_c.c new file mode 100644 index 000000000..febfcb24c --- /dev/null +++ b/vp8/common/copy_c.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
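The vp8_copy32xn_c that follows is carried over verbatim from vp8/common/sad_c.c, which this patch deletes further down. For comparison, a compact equivalent that sidesteps the CONFIG_FAST_UNALIGNED split entirely — a sketch only, not what the patch adds (vp8_copy32xn_sketch is a hypothetical name):

#include <string.h>

/* Copy one 32-byte row (two 16-pixel-wide macroblocks) per iteration. */
static void vp8_copy32xn_sketch(const unsigned char *src_ptr, int src_stride,
                                unsigned char *dst_ptr, int dst_stride,
                                int height) {
  int r;
  for (r = 0; r < height; r++) {
    memcpy(dst_ptr, src_ptr, 32);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}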
+ */ + + +#include "vpx_config.h" +#include "vpx/vpx_integer.h" + +/* Copy 2 macroblocks to a buffer */ +void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride, + unsigned char *dst_ptr, int dst_stride, + int height) +{ + int r; + + for (r = 0; r < height; r++) + { +#if !(CONFIG_FAST_UNALIGNED) + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + dst_ptr[2] = src_ptr[2]; + dst_ptr[3] = src_ptr[3]; + dst_ptr[4] = src_ptr[4]; + dst_ptr[5] = src_ptr[5]; + dst_ptr[6] = src_ptr[6]; + dst_ptr[7] = src_ptr[7]; + dst_ptr[8] = src_ptr[8]; + dst_ptr[9] = src_ptr[9]; + dst_ptr[10] = src_ptr[10]; + dst_ptr[11] = src_ptr[11]; + dst_ptr[12] = src_ptr[12]; + dst_ptr[13] = src_ptr[13]; + dst_ptr[14] = src_ptr[14]; + dst_ptr[15] = src_ptr[15]; + dst_ptr[16] = src_ptr[16]; + dst_ptr[17] = src_ptr[17]; + dst_ptr[18] = src_ptr[18]; + dst_ptr[19] = src_ptr[19]; + dst_ptr[20] = src_ptr[20]; + dst_ptr[21] = src_ptr[21]; + dst_ptr[22] = src_ptr[22]; + dst_ptr[23] = src_ptr[23]; + dst_ptr[24] = src_ptr[24]; + dst_ptr[25] = src_ptr[25]; + dst_ptr[26] = src_ptr[26]; + dst_ptr[27] = src_ptr[27]; + dst_ptr[28] = src_ptr[28]; + dst_ptr[29] = src_ptr[29]; + dst_ptr[30] = src_ptr[30]; + dst_ptr[31] = src_ptr[31]; +#else + ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ; + ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ; + ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ; + ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ; + ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ; + ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ; + ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ; + ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ; +#endif + src_ptr += src_stride; + dst_ptr += dst_stride; + + } +} diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c index eea63473d..d12dea193 100644 --- a/vp8/common/mfqe.c +++ b/vp8/common/mfqe.c @@ -17,10 +17,11 @@ * higher quality. 
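The mfqe.c hunks below turn a block SAD into a rounded per-pixel mean: a 16x16 block has 256 pixels, so the mean is (sad + 128) >> 8; an 8x8 block uses (sad + 32) >> 6; a 4x4 block uses (sad + 8) >> 4. In general, for a block of 2^n pixels:

#include "vpx/vpx_integer.h"

/* Rounded division by 2^n; n is log2 of the pixel count
 * (n = 8 for 16x16, n = 6 for 8x8, n = 4 for 4x4). */
static uint32_t mean_abs_diff(uint32_t sad, int n) {
  return (sad + (1u << (n - 1))) >> n;
}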
*/ -#include "postproc.h" -#include "variance.h" +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vp8/common/postproc.h" +#include "vp8/common/variance.h" #include "vpx_mem/vpx_mem.h" -#include "vp8_rtcd.h" #include "vpx_scale/yv12config.h" #include @@ -160,9 +161,9 @@ static void multiframe_quality_enhance_block vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 32)>>6; #else - sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8; - usad = (vp8_sad8x8(u, uv_stride, ud, uvd_stride, UINT_MAX) + 32) >> 6; - vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, UINT_MAX)+ 32) >> 6; + sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; + usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6; + vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride)+ 32) >> 6; #endif } else /* if (blksize == 8) */ @@ -177,9 +178,9 @@ static void multiframe_quality_enhance_block vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 8)>>4; #else - sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6; - usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, UINT_MAX) + 8) >> 4; - vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, UINT_MAX) + 8) >> 4; + sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6; + usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4; + vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4; #endif } diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 63fde4c9c..56b7db7ec 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -303,88 +303,6 @@ specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/; $vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt; $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6; -# -# Single block SAD -# -add_proto qw/unsigned int vp8_sad4x4/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"; -specialize qw/vp8_sad4x4 mmx sse2 neon/; -$vp8_sad4x4_sse2=vp8_sad4x4_wmt; - -add_proto qw/unsigned int vp8_sad8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"; -specialize qw/vp8_sad8x8 mmx sse2 neon/; -$vp8_sad8x8_sse2=vp8_sad8x8_wmt; - -add_proto qw/unsigned int vp8_sad8x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"; -specialize qw/vp8_sad8x16 mmx sse2 neon/; -$vp8_sad8x16_sse2=vp8_sad8x16_wmt; - -add_proto qw/unsigned int vp8_sad16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"; -specialize qw/vp8_sad16x8 mmx sse2 neon/; -$vp8_sad16x8_sse2=vp8_sad16x8_wmt; - -add_proto qw/unsigned int vp8_sad16x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"; -specialize qw/vp8_sad16x16 mmx sse2 sse3 media neon/; -$vp8_sad16x16_sse2=vp8_sad16x16_wmt; -$vp8_sad16x16_media=vp8_sad16x16_armv6; - -# -# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally -# -add_proto qw/void vp8_sad4x4x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp8_sad4x4x3 sse3/; - -add_proto qw/void vp8_sad8x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp8_sad8x8x3 sse3/; - -add_proto 
qw/void vp8_sad8x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp8_sad8x16x3 sse3/; - -add_proto qw/void vp8_sad16x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp8_sad16x8x3 sse3 ssse3/; - -add_proto qw/void vp8_sad16x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp8_sad16x16x3 sse3 ssse3/; - -# Note the only difference in the following prototypes is that they return into -# an array of short -add_proto qw/void vp8_sad4x4x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"; -specialize qw/vp8_sad4x4x8 sse4_1/; -$vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4; - -add_proto qw/void vp8_sad8x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"; -specialize qw/vp8_sad8x8x8 sse4_1/; -$vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4; - -add_proto qw/void vp8_sad8x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"; -specialize qw/vp8_sad8x16x8 sse4_1/; -$vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4; - -add_proto qw/void vp8_sad16x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"; -specialize qw/vp8_sad16x8x8 sse4_1/; -$vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4; - -add_proto qw/void vp8_sad16x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"; -specialize qw/vp8_sad16x16x8 sse4_1/; -$vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4; - -# -# Multi-block SAD, comparing a reference to N independent blocks -# -add_proto qw/void vp8_sad4x4x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp8_sad4x4x4d sse3/; - -add_proto qw/void vp8_sad8x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp8_sad8x8x4d sse3/; - -add_proto qw/void vp8_sad8x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp8_sad8x16x4d sse3/; - -add_proto qw/void vp8_sad16x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp8_sad16x8x4d sse3/; - -add_proto qw/void vp8_sad16x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp8_sad16x16x4d sse3/; - # # Encoder functions below this point. # diff --git a/vp8/common/sad_c.c b/vp8/common/sad_c.c deleted file mode 100644 index 5f36fc96e..000000000 --- a/vp8/common/sad_c.c +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
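The deleted sad_mx_n_c below is the generic kernel behind every vp8_sadNxM_c. Its max_sad parameter allowed, but never required, an early exit — its own comment notes that alternative implementations need not check it — which is why the shared vpx_dsp replacements can drop the parameter outright. A call site therefore changes from

  sad = vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX);

to

  sad = vpx_sad16x16(y, y_stride, yd, yd_stride);

exactly as the mfqe.c hunk above shows.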
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include -#include -#include "vpx_config.h" -#include "vpx/vpx_integer.h" - -static unsigned int sad_mx_n_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int max_sad, int m, int n) -{ - int r, c; - unsigned int sad = 0; - - for (r = 0; r < n; r++) - { - for (c = 0; c < m; c++) - { - sad += abs(src_ptr[c] - ref_ptr[c]); - } - - if (sad > max_sad) - break; - - src_ptr += src_stride; - ref_ptr += ref_stride; - } - - return sad; -} - -/* max_sad is provided as an optional optimization point. Alternative - * implementations of these functions are not required to check it. - */ - -unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int max_sad) -{ - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 16); -} - -unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int max_sad) -{ - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 8); -} - -unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int max_sad) -{ - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 8); - -} - -unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int max_sad) -{ - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 16); -} - -unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int max_sad) -{ - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 4, 4); -} - -void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sad_array) -{ - sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX); - sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX); - sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX); -} - -void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned short *sad_array) -{ - sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX); - sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX); - sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX); - sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX); - sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX); - sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX); - sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX); - sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX); -} - -void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sad_array) -{ - sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, 
UINT_MAX); - sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX); - sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX); -} - -void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned short *sad_array) -{ - sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX); - sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX); - sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX); - sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX); - sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX); - sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX); - sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX); - sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX); -} - -void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sad_array) -{ - sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX); - sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX); - sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX); -} - -void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned short *sad_array) -{ - sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX); - sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX); - sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX); - sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX); - sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX); - sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX); - sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX); - sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX); -} - -void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sad_array) -{ - sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX); - sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX); - sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX); -} - -void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned short *sad_array) -{ - sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX); - sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX); - sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX); - sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX); - 
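These x3 and x8 wrappers evaluate the same source block against the reference shifted 0..2 (or 0..7) pixels to the right — the horizontal sweep a motion search uses for fine refinement. A sketch of the x8 form, built on a plain SAD helper (both names are hypothetical):

#include <stdlib.h>
#include "vpx/vpx_integer.h"

static unsigned int sad_ref_c(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

/* Eight SADs at ref + 0 through ref + 7; results narrowed to
 * unsigned short exactly as the deleted x8 wrappers do. */
static void sad_x8_ref_c(const uint8_t *src, int src_stride,
                         const uint8_t *ref, int ref_stride,
                         int width, int height, uint16_t sad_array[8]) {
  int i;
  for (i = 0; i < 8; ++i)
    sad_array[i] = (uint16_t)sad_ref_c(src, src_stride, ref + i, ref_stride,
                                       width, height);
}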
sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX); - sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX); - sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX); - sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX); -} - -void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sad_array) -{ - sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX); - sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX); - sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX); -} - -void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned short *sad_array) -{ - sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX); - sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX); - sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX); - sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX); - sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX); - sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX); - sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX); - sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX); -} - -void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, - const unsigned char * const ref_ptr[], int ref_stride, - unsigned int *sad_array) -{ - sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX); - sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX); - sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX); - sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX); -} - -void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, - const unsigned char * const ref_ptr[], int ref_stride, - unsigned int *sad_array) -{ - sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX); - sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX); - sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX); - sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX); -} - -void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, - const unsigned char * const ref_ptr[], int ref_stride, - unsigned int *sad_array) -{ - sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX); - sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX); - sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX); - sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX); -} - -void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, - const unsigned char * const ref_ptr[], int ref_stride, - unsigned int *sad_array) -{ - sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], 
ref_stride, UINT_MAX); - sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX); - sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX); - sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX); -} - -void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, - const unsigned char * const ref_ptr[], int ref_stride, - unsigned int *sad_array) -{ - sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX); - sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX); - sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX); - sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX); -} - -/* Copy 2 macroblocks to a buffer */ -void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride, - unsigned char *dst_ptr, int dst_stride, - int height) -{ - int r; - - for (r = 0; r < height; r++) - { -#if !(CONFIG_FAST_UNALIGNED) - dst_ptr[0] = src_ptr[0]; - dst_ptr[1] = src_ptr[1]; - dst_ptr[2] = src_ptr[2]; - dst_ptr[3] = src_ptr[3]; - dst_ptr[4] = src_ptr[4]; - dst_ptr[5] = src_ptr[5]; - dst_ptr[6] = src_ptr[6]; - dst_ptr[7] = src_ptr[7]; - dst_ptr[8] = src_ptr[8]; - dst_ptr[9] = src_ptr[9]; - dst_ptr[10] = src_ptr[10]; - dst_ptr[11] = src_ptr[11]; - dst_ptr[12] = src_ptr[12]; - dst_ptr[13] = src_ptr[13]; - dst_ptr[14] = src_ptr[14]; - dst_ptr[15] = src_ptr[15]; - dst_ptr[16] = src_ptr[16]; - dst_ptr[17] = src_ptr[17]; - dst_ptr[18] = src_ptr[18]; - dst_ptr[19] = src_ptr[19]; - dst_ptr[20] = src_ptr[20]; - dst_ptr[21] = src_ptr[21]; - dst_ptr[22] = src_ptr[22]; - dst_ptr[23] = src_ptr[23]; - dst_ptr[24] = src_ptr[24]; - dst_ptr[25] = src_ptr[25]; - dst_ptr[26] = src_ptr[26]; - dst_ptr[27] = src_ptr[27]; - dst_ptr[28] = src_ptr[28]; - dst_ptr[29] = src_ptr[29]; - dst_ptr[30] = src_ptr[30]; - dst_ptr[31] = src_ptr[31]; -#else - ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ; - ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ; - ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ; - ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ; - ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ; - ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ; - ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ; - ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ; -#endif - src_ptr += src_stride; - dst_ptr += dst_stride; - - } -} diff --git a/vp8/common/variance.h b/vp8/common/variance.h index 89a32a722..552a28025 100644 --- a/vp8/common/variance.h +++ b/vp8/common/variance.h @@ -14,16 +14,17 @@ #include "vpx_config.h" +#include "vpx/vpx_integer.h" + #ifdef __cplusplus extern "C" { #endif -typedef unsigned int(*vp8_sad_fn_t)( - const unsigned char *src_ptr, +typedef unsigned int(*vpx_sad_fn_t)( + const uint8_t *src_ptr, int source_stride, - const unsigned char *ref_ptr, - int ref_stride, - unsigned int max_sad); + const uint8_t *ref_ptr, + int ref_stride); typedef void (*vp8_copy32xn_fn_t)( const unsigned char *src_ptr, @@ -32,27 +33,17 @@ typedef void (*vp8_copy32xn_fn_t)( int ref_stride, int n); -typedef void (*vp8_sad_multi_fn_t)( +typedef void (*vpx_sad_multi_fn_t)( const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, + const unsigned char *ref_array, int ref_stride, unsigned int *sad_array); - -typedef void (*vp8_sad_multi1_fn_t) - ( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int ref_stride, - unsigned short *sad_array - ); - 
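The variance_vtable just below is what consumes these typedefs: the encoder fills one table entry per block size and the search code calls through it. A sketch of how an entry could be wired up after this patch, assuming vp8's usual fn_ptr initialization (the actual encoder-side setup is outside this diff):

  /* hypothetical wiring; the functions are the shared vpx_dsp versions */
  cpi->fn_ptr[BLOCK_16X16].sdf    = vpx_sad16x16;    /* vpx_sad_fn_t */
  cpi->fn_ptr[BLOCK_16X16].sdx3f  = vpx_sad16x16x3;  /* vpx_sad_multi_fn_t */
  cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d; /* vpx_sad_multi_d_fn_t */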
-typedef void (*vp8_sad_multi_d_fn_t) +typedef void (*vpx_sad_multi_d_fn_t) ( const unsigned char *src_ptr, int source_stride, - const unsigned char * const ref_ptr[], + const unsigned char * const ref_array[], int ref_stride, unsigned int *sad_array ); @@ -102,15 +93,15 @@ typedef unsigned int (*vp8_get16x16prederror_fn_t) typedef struct variance_vtable { - vp8_sad_fn_t sdf; + vpx_sad_fn_t sdf; vp8_variance_fn_t vf; vp8_subpixvariance_fn_t svf; vp8_variance_fn_t svf_halfpix_h; vp8_variance_fn_t svf_halfpix_v; vp8_variance_fn_t svf_halfpix_hv; - vp8_sad_multi_fn_t sdx3f; - vp8_sad_multi1_fn_t sdx8f; - vp8_sad_multi_d_fn_t sdx4df; + vpx_sad_multi_fn_t sdx3f; + vpx_sad_multi_fn_t sdx8f; + vpx_sad_multi_d_fn_t sdx4df; #if ARCH_X86 || ARCH_X86_64 vp8_copy32xn_fn_t copymem; #endif diff --git a/vp8/common/x86/copy_sse2.asm b/vp8/common/x86/copy_sse2.asm new file mode 100644 index 000000000..86fae2695 --- /dev/null +++ b/vp8/common/x86/copy_sse2.asm @@ -0,0 +1,93 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_copy32xn_sse2( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +global sym(vp8_copy32xn_sse2) PRIVATE +sym(vp8_copy32xn_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;dst_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;dst_stride + movsxd rcx, dword ptr arg(4) ;height + +.block_copy_sse2_loopx4: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + movdqu xmm2, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqu xmm4, XMMWORD PTR [rsi] + movdqu xmm5, XMMWORD PTR [rsi + 16] + movdqu xmm6, XMMWORD PTR [rsi + rax] + movdqu xmm7, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + movdqa XMMWORD PTR [rdi + rdx], xmm2 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 + + lea rdi, [rdi+rdx*2] + + movdqa XMMWORD PTR [rdi], xmm4 + movdqa XMMWORD PTR [rdi + 16], xmm5 + movdqa XMMWORD PTR [rdi + rdx], xmm6 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 + + lea rdi, [rdi+rdx*2] + + sub rcx, 4 + cmp rcx, 4 + jge .block_copy_sse2_loopx4 + + cmp rcx, 0 + je .copy_is_done + +.block_copy_sse2_loop: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + lea rsi, [rsi+rax] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + lea rdi, [rdi+rdx] + + sub rcx, 1 + jne .block_copy_sse2_loop + +.copy_is_done: + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/common/x86/copy_sse3.asm b/vp8/common/x86/copy_sse3.asm new file mode 100644 index 000000000..d789a40cc --- /dev/null +++ b/vp8/common/x86/copy_sse3.asm @@ -0,0 +1,146 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
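Both new copy kernels — the SSE2 one above and the SSE3 one that follows — share one pattern: unaligned movdqu loads from the source, aligned movdqa stores to the destination, four 32-byte rows per main-loop iteration, then a one-row tail loop. The same pattern in intrinsics, simplified to one row per iteration (copy32xn_sketch is a hypothetical name; dst must be 16-byte aligned or the aligned stores fault):

#include <emmintrin.h>

static void copy32xn_sketch(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride, int height) {
  while (height-- > 0) {
    /* movdqu: the source may be unaligned */
    const __m128i a = _mm_loadu_si128((const __m128i *)src);
    const __m128i b = _mm_loadu_si128((const __m128i *)(src + 16));
    /* movdqa: the destination buffer is assumed 16-byte aligned */
    _mm_store_si128((__m128i *)dst, a);
    _mm_store_si128((__m128i *)(dst + 16), b);
    src += src_stride;
    dst += dst_stride;
  }
}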
An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define ref_ptr rdi
+ %define ref_stride rdx
+ %define end_ptr rcx
+ %define ret_var rbx
+ %define result_ptr arg(4)
+ %define max_sad arg(4)
+ %define height dword ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ mov rsi, arg(0) ; src_ptr
+ mov rdi, arg(2) ; ref_ptr
+
+ movsxd rax, dword ptr arg(1) ; src_stride
+ movsxd rdx, dword ptr arg(3) ; ref_stride
+%else
+ %if LIBVPX_YASM_WIN64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define ref_ptr r8
+ %define ref_stride r9
+ %define end_ptr r10
+ %define ret_var r11
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define max_sad [rsp+xmm_stack_space+8+4*8]
+ %define height dword ptr [rsp+xmm_stack_space+8+4*8]
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define ref_ptr rdx
+ %define ref_stride rcx
+ %define end_ptr r9
+ %define ret_var r10
+ %define result_ptr r8
+ %define max_sad r8
+ %define height r8
+ %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+ %define src_ptr
+ %define src_stride
+ %define ref_ptr
+ %define ref_stride
+ %define end_ptr
+ %define ret_var
+ %define result_ptr
+ %define max_sad
+ %define height
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %if LIBVPX_YASM_WIN64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+
+;void vp8_copy32xn_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse3) PRIVATE
+sym(vp8_copy32xn_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+.block_copy_sse3_loopx4:
+ lea end_ptr, [src_ptr+src_stride*2]
+
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
+ movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
+ movdqu xmm4, XMMWORD PTR [end_ptr]
+ movdqu xmm5, XMMWORD PTR [end_ptr + 16]
+ movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
+ movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
+
+ lea src_ptr, [src_ptr+src_stride*4]
+
+ lea end_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
+ movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+ movdqa XMMWORD PTR [end_ptr], xmm4
+ movdqa XMMWORD PTR [end_ptr + 16], xmm5
+ movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
+ movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+ lea ref_ptr, [ref_ptr+ref_stride*4]
+
+ sub height, 4
+ cmp height, 4
+ jge .block_copy_sse3_loopx4
+
+ ;Check to see if there are more rows that need to be copied.
+ cmp height, 0
+ je .copy_is_done
+
+.block_copy_sse3_loop:
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ lea src_ptr, [src_ptr+src_stride]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ lea ref_ptr, [ref_ptr+ref_stride]
+
+ sub height, 1
+ jne .block_copy_sse3_loop
+
+.copy_is_done:
+ STACK_FRAME_DESTROY_X3
diff --git a/vp8/common/x86/sad_sse2.asm b/vp8/common/x86/sad_sse2.asm
deleted file mode 100644
index 8d86abc07..000000000
--- a/vp8/common/x86/sad_sse2.asm
+++ /dev/null
@@ -1,410 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;unsigned int vp8_sad16x16_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp8_sad16x16_wmt) PRIVATE -sym(vp8_sad16x16_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 6 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - - lea rcx, [rcx+rax*8] - pxor xmm6, xmm6 - -.x16x16sad_wmt_loop: - - movq xmm0, QWORD PTR [rsi] - movq xmm2, QWORD PTR [rsi+8] - - movq xmm1, QWORD PTR [rdi] - movq xmm3, QWORD PTR [rdi+8] - - movq xmm4, QWORD PTR [rsi+rax] - movq xmm5, QWORD PTR [rdi+rdx] - - - punpcklbw xmm0, xmm2 - punpcklbw xmm1, xmm3 - - psadbw xmm0, xmm1 - movq xmm2, QWORD PTR [rsi+rax+8] - - movq xmm3, QWORD PTR [rdi+rdx+8] - lea rsi, [rsi+rax*2] - - lea rdi, [rdi+rdx*2] - punpcklbw xmm4, xmm2 - - punpcklbw xmm5, xmm3 - psadbw xmm4, xmm5 - - paddw xmm6, xmm0 - paddw xmm6, xmm4 - - cmp rsi, rcx - jne .x16x16sad_wmt_loop - - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movq rax, xmm0 - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;unsigned int vp8_sad8x16_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int max_sad) -global sym(vp8_sad8x16_wmt) PRIVATE -sym(vp8_sad8x16_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rbx, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rbx*8] - - lea rcx, [rcx+rbx*8] - pxor mm7, mm7 - -.x8x16sad_wmt_loop: - - movq rax, mm7 - cmp eax, arg(4) - ja .x8x16sad_wmt_early_exit - - movq mm0, QWORD PTR [rsi] - movq mm1, QWORD PTR [rdi] - - movq mm2, QWORD PTR [rsi+rbx] - movq mm3, QWORD PTR [rdi+rdx] - - psadbw mm0, mm1 - psadbw mm2, mm3 - - lea rsi, [rsi+rbx*2] - lea rdi, [rdi+rdx*2] - - paddw mm7, mm0 - paddw mm7, mm2 - - cmp rsi, rcx - jne .x8x16sad_wmt_loop - - movq rax, mm7 - -.x8x16sad_wmt_early_exit: - - ; begin epilog - pop rdi - pop rsi - pop rbx - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_sad8x8_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp8_sad8x8_wmt) PRIVATE -sym(vp8_sad8x8_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rbx, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rbx*8] - pxor mm7, mm7 - -.x8x8sad_wmt_loop: - - movq rax, mm7 - cmp eax, arg(4) - ja .x8x8sad_wmt_early_exit - - movq mm0, QWORD PTR [rsi] - movq mm1, QWORD PTR [rdi] - - psadbw mm0, mm1 - lea rsi, [rsi+rbx] - - add rdi, rdx - paddw mm7, mm0 - - cmp rsi, rcx - jne .x8x8sad_wmt_loop - - movq rax, mm7 -.x8x8sad_wmt_early_exit: - - ; begin epilog - pop rdi - pop rsi - pop rbx - UNSHADOW_ARGS - pop rbp - ret - -;unsigned int vp8_sad4x4_wmt( -; unsigned char *src_ptr, -; int src_stride, -; 
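All of the deleted *_wmt kernels lean on psadbw, which sums the absolute differences of eight byte pairs into a single word per 64-bit lane. The same reduction with SSE2 intrinsics, as a sketch (sad16x16_sse2_sketch is a hypothetical name):

#include <emmintrin.h>
#include <stdint.h>

static unsigned int sad16x16_sse2_sketch(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride) {
  __m128i acc = _mm_setzero_si128();
  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
    /* psadbw: two partial sums, one in each 64-bit lane */
    acc = _mm_add_epi64(acc, _mm_sad_epu8(s, r));
    src += src_stride;
    ref += ref_stride;
  }
  acc = _mm_add_epi64(acc, _mm_srli_si128(acc, 8));  /* fold high lane into low */
  return (unsigned int)_mm_cvtsi128_si32(acc);
}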
unsigned char *ref_ptr, -; int ref_stride) -global sym(vp8_sad4x4_wmt) PRIVATE -sym(vp8_sad4x4_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - movd mm0, DWORD PTR [rsi] - movd mm1, DWORD PTR [rdi] - - movd mm2, DWORD PTR [rsi+rax] - movd mm3, DWORD PTR [rdi+rdx] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - psadbw mm0, mm1 - lea rsi, [rsi+rax*2] - - lea rdi, [rdi+rdx*2] - movd mm4, DWORD PTR [rsi] - - movd mm5, DWORD PTR [rdi] - movd mm6, DWORD PTR [rsi+rax] - - movd mm7, DWORD PTR [rdi+rdx] - punpcklbw mm4, mm6 - - punpcklbw mm5, mm7 - psadbw mm4, mm5 - - paddw mm0, mm4 - movq rax, mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_sad16x8_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp8_sad16x8_wmt) PRIVATE -sym(vp8_sad16x8_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rbx - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rbx, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rbx*8] - pxor mm7, mm7 - -.x16x8sad_wmt_loop: - - movq rax, mm7 - cmp eax, arg(4) - ja .x16x8sad_wmt_early_exit - - movq mm0, QWORD PTR [rsi] - movq mm2, QWORD PTR [rsi+8] - - movq mm1, QWORD PTR [rdi] - movq mm3, QWORD PTR [rdi+8] - - movq mm4, QWORD PTR [rsi+rbx] - movq mm5, QWORD PTR [rdi+rdx] - - psadbw mm0, mm1 - psadbw mm2, mm3 - - movq mm1, QWORD PTR [rsi+rbx+8] - movq mm3, QWORD PTR [rdi+rdx+8] - - psadbw mm4, mm5 - psadbw mm1, mm3 - - lea rsi, [rsi+rbx*2] - lea rdi, [rdi+rdx*2] - - paddw mm0, mm2 - paddw mm4, mm1 - - paddw mm7, mm0 - paddw mm7, mm4 - - cmp rsi, rcx - jne .x16x8sad_wmt_loop - - movq rax, mm7 - -.x16x8sad_wmt_early_exit: - - ; begin epilog - pop rdi - pop rsi - pop rbx - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_copy32xn_sse2( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; int height); -global sym(vp8_copy32xn_sse2) PRIVATE -sym(vp8_copy32xn_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;dst_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;dst_stride - movsxd rcx, dword ptr arg(4) ;height - -.block_copy_sse2_loopx4: - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi + 16] - movdqu xmm2, XMMWORD PTR [rsi + rax] - movdqu xmm3, XMMWORD PTR [rsi + rax + 16] - - lea rsi, [rsi+rax*2] - - movdqu xmm4, XMMWORD PTR [rsi] - movdqu xmm5, XMMWORD PTR [rsi + 16] - movdqu xmm6, XMMWORD PTR [rsi + rax] - movdqu xmm7, XMMWORD PTR [rsi + rax + 16] - - lea rsi, [rsi+rax*2] - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm1 - movdqa XMMWORD PTR [rdi + rdx], xmm2 - movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 - - lea rdi, [rdi+rdx*2] - - movdqa XMMWORD PTR [rdi], xmm4 - movdqa XMMWORD PTR [rdi + 16], xmm5 - movdqa XMMWORD PTR [rdi + rdx], xmm6 - movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 - - lea rdi, [rdi+rdx*2] - - sub rcx, 4 - cmp rcx, 4 - jge .block_copy_sse2_loopx4 - - cmp rcx, 0 - je .copy_is_done - -.block_copy_sse2_loop: - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi + 16] - lea rsi, [rsi+rax] - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 
16], xmm1 - lea rdi, [rdi+rdx] - - sub rcx, 1 - jne .block_copy_sse2_loop - -.copy_is_done: - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp8/common/x86/sad_sse3.asm b/vp8/common/x86/sad_sse3.asm deleted file mode 100644 index 69c8d3769..000000000 --- a/vp8/common/x86/sad_sse3.asm +++ /dev/null @@ -1,960 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -%macro STACK_FRAME_CREATE_X3 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define ref_ptr rdi - %define ref_stride rdx - %define end_ptr rcx - %define ret_var rbx - %define result_ptr arg(4) - %define max_sad arg(4) - %define height dword ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - mov rsi, arg(0) ; src_ptr - mov rdi, arg(2) ; ref_ptr - - movsxd rax, dword ptr arg(1) ; src_stride - movsxd rdx, dword ptr arg(3) ; ref_stride -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define ref_ptr r8 - %define ref_stride r9 - %define end_ptr r10 - %define ret_var r11 - %define result_ptr [rsp+xmm_stack_space+8+4*8] - %define max_sad [rsp+xmm_stack_space+8+4*8] - %define height dword ptr [rsp+xmm_stack_space+8+4*8] - %else - %define src_ptr rdi - %define src_stride rsi - %define ref_ptr rdx - %define ref_stride rcx - %define end_ptr r9 - %define ret_var r10 - %define result_ptr r8 - %define max_sad r8 - %define height r8 - %endif -%endif - -%endmacro - -%macro STACK_FRAME_DESTROY_X3 0 - %define src_ptr - %define src_stride - %define ref_ptr - %define ref_stride - %define end_ptr - %define ret_var - %define result_ptr - %define max_sad - %define height - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBVPX_YASM_WIN64 - RESTORE_XMM - %endif -%endif - ret -%endmacro - -%macro STACK_FRAME_CREATE_X4 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define r0_ptr rcx - %define r1_ptr rdx - %define r2_ptr rbx - %define r3_ptr rdi - %define ref_stride rbp - %define result_ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - push rbp - mov rdi, arg(2) ; ref_ptr_base - - LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi - - mov rsi, arg(0) ; src_ptr - - movsxd rbx, dword ptr arg(1) ; src_stride - movsxd rbp, dword ptr arg(3) ; ref_stride - - xchg rbx, rax -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define r0_ptr rsi - %define r1_ptr r10 - %define r2_ptr r11 - %define r3_ptr r8 - %define ref_stride r9 - %define result_ptr [rsp+xmm_stack_space+16+4*8] - push rsi - - LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr - %else - %define src_ptr rdi - %define src_stride rsi - %define r0_ptr r9 - %define r1_ptr r10 - %define r2_ptr r11 - %define r3_ptr rdx - %define ref_stride rcx - %define result_ptr r8 - - LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr - - %endif -%endif -%endmacro - -%macro STACK_FRAME_DESTROY_X4 0 - %define src_ptr - %define src_stride - %define r0_ptr - %define r1_ptr - %define r2_ptr - %define r3_ptr - %define ref_stride - %define result_ptr - -%if ABI_IS_32BIT - pop rbx - pop rdi 
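/* The vp8_copy32xn_sse2 routine deleted above copies a 32-byte-wide block of
 * `height` rows, four rows at a time with a scalar tail, using unaligned
 * loads (movdqu) and aligned stores (movdqa), so the destination must be
 * 16-byte aligned. The copy helpers stay in vp8 (see the copy_c.c and
 * copy_sse2/sse3.asm additions in vp8_common.mk later in this patch) rather
 * than moving to vpx_dsp. A plain C sketch of the same contract;
 * illustrative only: */
#include <string.h>

static void copy32xn_c(const unsigned char *src, int src_stride,
                       unsigned char *dst, int dst_stride, int height) {
  int r;
  for (r = 0; r < height; ++r) {
    memcpy(dst, src, 32);  /* one 32-byte row per iteration */
    src += src_stride;
    dst += dst_stride;
  }
}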
- pop rsi - pop rbp -%else - %if LIBVPX_YASM_WIN64 - pop rsi - RESTORE_XMM - %endif -%endif - ret -%endmacro - -%macro PROCESS_16X2X3 5 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm5, XMMWORD PTR [%3] - lddqu xmm6, XMMWORD PTR [%3+1] - lddqu xmm7, XMMWORD PTR [%3+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%3+1] - lddqu xmm3, XMMWORD PTR [%3+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [%2+%4] - lddqu xmm1, XMMWORD PTR [%3+%5] - lddqu xmm2, XMMWORD PTR [%3+%5+1] - lddqu xmm3, XMMWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_8X2X3 5 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm5, QWORD PTR [%3] - movq mm6, QWORD PTR [%3+1] - movq mm7, QWORD PTR [%3+2] - - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%3+1] - movq mm3, QWORD PTR [%3+2] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endif - movq mm0, QWORD PTR [%2+%4] - movq mm1, QWORD PTR [%3+%5] - movq mm2, QWORD PTR [%3+%5+1] - movq mm3, QWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endmacro - -%macro LOAD_X4_ADDRESSES 5 - mov %2, [%1+REG_SZ_BYTES*0] - mov %3, [%1+REG_SZ_BYTES*1] - - mov %4, [%1+REG_SZ_BYTES*2] - mov %5, [%1+REG_SZ_BYTES*3] -%endmacro - -%macro PROCESS_16X2X4 8 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm4, XMMWORD PTR [%3] - lddqu xmm5, XMMWORD PTR [%4] - lddqu xmm6, XMMWORD PTR [%5] - lddqu xmm7, XMMWORD PTR [%6] - - psadbw xmm4, xmm0 - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%4] - lddqu xmm3, XMMWORD PTR [%5] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm4, xmm1 - lddqu xmm1, XMMWORD PTR [%6] - paddw xmm5, xmm2 - paddw xmm6, xmm3 - - psadbw xmm1, xmm0 - paddw xmm7, xmm1 -%endif - movdqa xmm0, XMMWORD PTR [%2+%7] - lddqu xmm1, XMMWORD PTR [%3+%8] - lddqu xmm2, XMMWORD PTR [%4+%8] - lddqu xmm3, XMMWORD PTR [%5+%8] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm4, xmm1 - lddqu xmm1, XMMWORD PTR [%6+%8] - paddw xmm5, xmm2 - paddw xmm6, xmm3 - -%if %1==0 || %1==1 - lea %2, [%2+%7*2] - lea %3, [%3+%8*2] - - lea %4, [%4+%8*2] - lea %5, [%5+%8*2] - - lea %6, [%6+%8*2] -%endif - psadbw xmm1, xmm0 - paddw xmm7, xmm1 - -%endmacro - -%macro PROCESS_8X2X4 8 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm4, QWORD PTR [%3] - movq mm5, QWORD PTR [%4] - movq mm6, QWORD PTR [%5] - movq mm7, QWORD PTR [%6] - - psadbw mm4, mm0 - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%4] - movq mm3, QWORD PTR [%5] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm4, mm1 - movq mm1, QWORD PTR [%6] - paddw mm5, mm2 - paddw mm6, mm3 - - psadbw mm1, mm0 - paddw mm7, mm1 -%endif - movq mm0, QWORD PTR [%2+%7] - movq mm1, QWORD PTR [%3+%8] - 
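/* The PROCESS_*X2X3 macros above evaluate three candidate positions at once:
 * the source block against ref, ref+1 and ref+2, i.e. three horizontally
 * adjacent offsets, two rows per invocation. A scalar C sketch of the x3
 * routines built from them; illustrative only: */
#include <stdlib.h>

static void sad_wxh_x3(const unsigned char *src, int src_stride,
                       const unsigned char *ref, int ref_stride,
                       int width, int height, unsigned int sad_array[3]) {
  int i, r, c;
  for (i = 0; i < 3; ++i) {
    const unsigned char *s = src;
    const unsigned char *p = ref + i;  /* candidate shifted i pixels right */
    unsigned int sad = 0;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c) sad += abs(s[c] - p[c]);
      s += src_stride;
      p += ref_stride;
    }
    sad_array[i] = sad;
  }
}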
movq mm2, QWORD PTR [%4+%8] - movq mm3, QWORD PTR [%5+%8] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm4, mm1 - movq mm1, QWORD PTR [%6+%8] - paddw mm5, mm2 - paddw mm6, mm3 - -%if %1==0 || %1==1 - lea %2, [%2+%7*2] - lea %3, [%3+%8*2] - - lea %4, [%4+%8*2] - lea %5, [%5+%8*2] - - lea %6, [%6+%8*2] -%endif - psadbw mm1, mm0 - paddw mm7, mm1 - -%endmacro - -;void int vp8_sad16x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp8_sad16x16x3_sse3) PRIVATE -sym(vp8_sad16x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vp8_sad16x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp8_sad16x8x3_sse3) PRIVATE -sym(vp8_sad16x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vp8_sad8x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp8_sad8x16x3_sse3) PRIVATE -sym(vp8_sad8x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vp8_sad8x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp8_sad8x8x3_sse3) PRIVATE -sym(vp8_sad8x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], 
mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vp8_sad4x4x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp8_sad4x4x3_sse3) PRIVATE -sym(vp8_sad4x4x3_sse3): - - STACK_FRAME_CREATE_X3 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [ref_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [ref_ptr+1] - movd mm5, DWORD PTR [ref_ptr+2] - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm3, DWORD PTR [ref_ptr+ref_stride+2] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - psadbw mm4, mm0 - psadbw mm5, mm0 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [ref_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm6, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm6 - - movd mm3, DWORD PTR [ref_ptr+1] - movd mm7, DWORD PTR [ref_ptr+2] - - psadbw mm2, mm0 - - paddw mm1, mm2 - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm6, DWORD PTR [ref_ptr+ref_stride+2] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm6 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - paddw mm3, mm4 - paddw mm7, mm5 - - mov rcx, result_ptr - - punpckldq mm1, mm3 - - movq [rcx], mm1 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;unsigned int vp8_sad16x16_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int max_sad) -;%define lddqu movdqu -global sym(vp8_sad16x16_sse3) PRIVATE -sym(vp8_sad16x16_sse3): - - STACK_FRAME_CREATE_X3 - - mov end_ptr, 4 - pxor xmm7, xmm7 - -.vp8_sad16x16_sse3_loop: - movdqa xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [ref_ptr] - movdqa xmm2, XMMWORD PTR [src_ptr+src_stride] - movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride] - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movdqa xmm4, XMMWORD PTR [src_ptr] - movdqu xmm5, XMMWORD PTR [ref_ptr] - movdqa xmm6, XMMWORD PTR [src_ptr+src_stride] - - psadbw xmm0, xmm1 - - movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride] - - psadbw xmm2, xmm3 - psadbw xmm4, xmm5 - psadbw xmm6, xmm1 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - paddw xmm7, xmm0 - paddw xmm7, xmm2 - paddw xmm7, xmm4 - paddw xmm7, xmm6 - - sub end_ptr, 1 - jne .vp8_sad16x16_sse3_loop - - movq xmm0, xmm7 - psrldq xmm7, 8 - paddw xmm0, xmm7 - movq rax, xmm0 - - STACK_FRAME_DESTROY_X3 - -;void vp8_copy32xn_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; int height); -global sym(vp8_copy32xn_sse3) PRIVATE -sym(vp8_copy32xn_sse3): - - STACK_FRAME_CREATE_X3 - -.block_copy_sse3_loopx4: - lea end_ptr, [src_ptr+src_stride*2] - - movdqu xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [src_ptr + 16] - movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] - movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] - movdqu xmm4, XMMWORD PTR [end_ptr] - movdqu xmm5, XMMWORD PTR [end_ptr + 16] - movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] - movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] - - lea src_ptr, [src_ptr+src_stride*4] - - lea end_ptr, [ref_ptr+ref_stride*2] - - movdqa XMMWORD PTR [ref_ptr], xmm0 - movdqa XMMWORD PTR [ref_ptr + 16], xmm1 - movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 - movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 - movdqa XMMWORD PTR [end_ptr], xmm4 - movdqa XMMWORD PTR 
[end_ptr + 16], xmm5 - movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 - movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 - - lea ref_ptr, [ref_ptr+ref_stride*4] - - sub height, 4 - cmp height, 4 - jge .block_copy_sse3_loopx4 - - ;Check to see if there is more rows need to be copied. - cmp height, 0 - je .copy_is_done - -.block_copy_sse3_loop: - movdqu xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [src_ptr + 16] - lea src_ptr, [src_ptr+src_stride] - - movdqa XMMWORD PTR [ref_ptr], xmm0 - movdqa XMMWORD PTR [ref_ptr + 16], xmm1 - lea ref_ptr, [ref_ptr+ref_stride] - - sub height, 1 - jne .block_copy_sse3_loop - -.copy_is_done: - STACK_FRAME_DESTROY_X3 - -;void vp8_sad16x16x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr_base, -; int ref_stride, -; int *results) -global sym(vp8_sad16x16x4d_sse3) PRIVATE -sym(vp8_sad16x16x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - movq xmm0, xmm4 - psrldq xmm4, 8 - - paddw xmm0, xmm4 - movd [rcx], xmm0 -;- - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+8], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+12], xmm0 - - STACK_FRAME_DESTROY_X4 - -;void vp8_sad16x8x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr_base, -; int ref_stride, -; int *results) -global sym(vp8_sad16x8x4d_sse3) PRIVATE -sym(vp8_sad16x8x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - movq xmm0, xmm4 - psrldq xmm4, 8 - - paddw xmm0, xmm4 - movd [rcx], xmm0 -;- - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+8], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+12], xmm0 - - STACK_FRAME_DESTROY_X4 - -;void int vp8_sad8x16x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp8_sad8x16x4d_sse3) PRIVATE -sym(vp8_sad8x16x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - 
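/* Unlike the x3 forms, the x4d routines above take four independent
 * reference pointers (gathered by LOAD_X4_ADDRESSES from ref_ptr_base)
 * rather than one pointer plus fixed offsets, and return four SADs in a
 * single pass over the source block. Scalar C sketch; illustrative only: */
#include <stdlib.h>

static void sad_wxh_x4d(const unsigned char *src, int src_stride,
                        const unsigned char *const ref[4], int ref_stride,
                        int width, int height, unsigned int sad_array[4]) {
  int i, r, c;
  for (i = 0; i < 4; ++i) {
    const unsigned char *s = src;
    const unsigned char *p = ref[i];
    unsigned int sad = 0;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c) sad += abs(s[c] - p[c]);
      s += src_stride;
      p += ref_stride;
    }
    sad_array[i] = sad;
  }
}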
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - punpckldq mm4, mm5 - punpckldq mm6, mm7 - - movq [rcx], mm4 - movq [rcx+8], mm6 - - STACK_FRAME_DESTROY_X4 - -;void int vp8_sad8x8x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp8_sad8x8x4d_sse3) PRIVATE -sym(vp8_sad8x8x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - punpckldq mm4, mm5 - punpckldq mm6, mm7 - - movq [rcx], mm4 - movq [rcx+8], mm6 - - STACK_FRAME_DESTROY_X4 - -;void int vp8_sad4x4x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp8_sad4x4x4d_sse3) PRIVATE -sym(vp8_sad4x4x4d_sse3): - - STACK_FRAME_CREATE_X4 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [r0_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [r0_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [r1_ptr] - movd mm5, DWORD PTR [r2_ptr] - - movd mm6, DWORD PTR [r3_ptr] - movd mm2, DWORD PTR [r1_ptr+ref_stride] - - movd mm3, DWORD PTR [r2_ptr+ref_stride] - movd mm7, DWORD PTR [r3_ptr+ref_stride] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - punpcklbw mm6, mm7 - psadbw mm4, mm0 - - psadbw mm5, mm0 - psadbw mm6, mm0 - - - - lea src_ptr, [src_ptr+src_stride*2] - lea r0_ptr, [r0_ptr+ref_stride*2] - - lea r1_ptr, [r1_ptr+ref_stride*2] - lea r2_ptr, [r2_ptr+ref_stride*2] - - lea r3_ptr, [r3_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [r0_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm7, DWORD PTR [r0_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm7 - - movd mm3, DWORD PTR [r1_ptr] - movd mm7, DWORD PTR [r2_ptr] - - psadbw mm2, mm0 -%if ABI_IS_32BIT - mov rax, rbp - - pop rbp -%define ref_stride rax -%endif - mov rsi, result_ptr - - paddw mm1, mm2 - movd [rsi], mm1 - - movd mm2, DWORD PTR [r1_ptr+ref_stride] - movd mm1, DWORD PTR [r2_ptr+ref_stride] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm1 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - movd mm2, DWORD PTR [r3_ptr] - movd mm1, DWORD PTR [r3_ptr+ref_stride] - - paddw mm3, mm4 - paddw mm7, mm5 - - movd [rsi+4], mm3 - punpcklbw mm2, mm1 - - movd [rsi+8], mm7 - psadbw mm2, mm0 - - paddw mm2, mm6 - movd [rsi+12], mm2 - - - STACK_FRAME_DESTROY_X4 - diff --git a/vp8/common/x86/sad_sse4.asm b/vp8/common/x86/sad_sse4.asm deleted file mode 100644 index f7fccd77c..000000000 --- a/vp8/common/x86/sad_sse4.asm +++ /dev/null @@ -1,353 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. 
An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X8 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm1, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm1, xmm2 - paddw xmm1, xmm3 - paddw xmm1, xmm4 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endif - movdqa xmm0, XMMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - movq xmm2, MMWORD PTR [rdi+ rdx+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_8X2X8 1 -%if %1 - movq xmm0, MMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm1, xmm2 -%else - movq xmm0, MMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endif - movq xmm0, MMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_4X2X8 1 -%if %1 - movd xmm0, [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - mpsadbw xmm1, xmm0, 0x0 -%else - movd xmm0, [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endif - movd xmm0, [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endmacro - - -;void vp8_sad16x16x8_sse4( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array); -global sym(vp8_sad16x16x8_sse4) PRIVATE -sym(vp8_sad16x16x8_sse4): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - 
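/* The SSE4.1 macros above lean on mpsadbw, which scores several horizontally
 * consecutive candidate offsets per instruction; that is how the
 * PROCESS_*X2X8 macros produce eight SADs per pair of block rows. Scalar C
 * sketch of the x8 routines; illustrative only. Note these vp8 prototypes
 * return the eight SADs as unsigned short, which is why mcomp.c below widens
 * sad_array8 to unsigned int when it switches to the shared vpx_dsp
 * versions: */
#include <stdlib.h>

static void sad_wxh_x8(const unsigned char *src, int src_stride,
                       const unsigned char *ref, int ref_stride,
                       int width, int height, unsigned short sad_array[8]) {
  int i, r, c;
  for (i = 0; i < 8; ++i) {
    const unsigned char *s = src;
    const unsigned char *p = ref + i;  /* eight consecutive candidates */
    unsigned int sad = 0;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c) sad += abs(s[c] - p[c]);
      s += src_stride;
      p += ref_stride;
    }
    sad_array[i] = (unsigned short)sad;
  }
}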
PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_sad16x8x8_sse4( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vp8_sad16x8x8_sse4) PRIVATE -sym(vp8_sad16x8x8_sse4): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_sad8x8x8_sse4( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vp8_sad8x8x8_sse4) PRIVATE -sym(vp8_sad8x8x8_sse4): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_sad8x16x8_sse4( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vp8_sad8x16x8_sse4) PRIVATE -sym(vp8_sad8x16x8_sse4): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_sad4x4x8_c( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vp8_sad4x4x8_sse4) PRIVATE -sym(vp8_sad4x4x8_sse4): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 - - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - - diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 7149f5f02..06f7f4639 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -9,6 +9,8 @@ */ +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "onyx_int.h" #include "mcomp.h" #include "vpx_mem/vpx_mem.h" @@ -900,7 +902,7 @@ int vp8_hex_search this_offset = base_offset + (br * (pre_stride)) + bc; this_mv.as_mv.row = br; this_mv.as_mv.col = bc; - bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, UINT_MAX) + bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride) + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); #if 
CONFIG_MULTI_RES_ENCODING @@ -927,7 +929,7 @@ int vp8_hex_search this_mv.as_mv.row = br + hex[i].row; this_mv.as_mv.col = bc + hex[i].col; this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); CHECK_BETTER } }else @@ -938,7 +940,7 @@ int vp8_hex_search this_mv.as_mv.col = bc + hex[i].col; CHECK_POINT this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); CHECK_BETTER } } @@ -964,7 +966,7 @@ int vp8_hex_search this_mv.as_mv.row = br + next_chkpts[k][i].row; this_mv.as_mv.col = bc + next_chkpts[k][i].col; this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); CHECK_BETTER } }else @@ -975,7 +977,7 @@ int vp8_hex_search this_mv.as_mv.col = bc + next_chkpts[k][i].col; CHECK_POINT this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); CHECK_BETTER } } @@ -1006,7 +1008,7 @@ cal_neighbors: this_mv.as_mv.row = br + neighbors[i].row; this_mv.as_mv.col = bc + neighbors[i].col; this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); CHECK_BETTER } }else @@ -1017,7 +1019,7 @@ cal_neighbors: this_mv.as_mv.col = bc + neighbors[i].col; CHECK_POINT this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); CHECK_BETTER } } @@ -1101,7 +1103,7 @@ int vp8_diamond_search_sad_c best_address = in_what; /* Check the starting position */ - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX) + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); /* search_param determines the length of the initial step and hence @@ -1126,7 +1128,7 @@ int vp8_diamond_search_sad_c { check_here = ss[i].offset + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { @@ -1225,7 +1227,7 @@ int vp8_diamond_search_sadx4 best_address = in_what; /* Check the starting position */ - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX) + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); /* search_param determines the length of the initial step and hence the @@ -1293,7 +1295,7 @@ int vp8_diamond_search_sadx4 (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) { check_here = ss[i].offset + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, 
check_here, in_what_stride); if (thissad < bestsad) { @@ -1376,8 +1378,7 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, best_mv->as_mv.col = ref_col; /* Baseline value at the centre */ - bestsad = fn_ptr->sdf(what, what_stride, bestaddress, - in_what_stride, UINT_MAX) + bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); /* Apply further limits to prevent us looking using vectors that @@ -1402,7 +1403,7 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, for (c = col_min; c < col_max; c++) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); this_mv.as_mv.col = c; thissad += mvsad_err_cost(&this_mv, &fcenter_mv, @@ -1474,8 +1475,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, best_mv->as_mv.col = ref_col; /* Baseline value at the centre */ - bestsad = fn_ptr->sdf(what, what_stride, bestaddress, - in_what_stride, UINT_MAX) + bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); /* Apply further limits to prevent us looking using vectors that stretch @@ -1531,7 +1531,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { @@ -1590,7 +1590,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int col_min = ref_col - distance; int col_max = ref_col + distance; - DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8); + // TODO(johannkoenig): check if this alignment is necessary. 
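/* Every mcomp.c hunk in this stretch makes the same mechanical change: the
 * block-match function pointer loses its fifth argument (the UINT_MAX /
 * bestsad cutoff) because the shared vpx_dsp signature carries no max_sad
 * parameter. In effect the sdf member's type changes as sketched below; the
 * typedef names are illustrative, not from the tree: */
typedef unsigned int (*sdf_before)(const unsigned char *src_ptr, int src_stride,
                                   const unsigned char *ref_ptr, int ref_stride,
                                   unsigned int max_sad);
typedef unsigned int (*sdf_after)(const unsigned char *src_ptr, int src_stride,
                                  const unsigned char *ref_ptr, int ref_stride);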
+ DECLARE_ALIGNED_ARRAY(16, unsigned int, sad_array8, 8); unsigned int sad_array[3]; int *mvsadcost[2]; @@ -1609,8 +1610,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, best_mv->as_mv.col = ref_col; /* Baseline value at the centre */ - bestsad = fn_ptr->sdf(what, what_stride, - bestaddress, in_what_stride, UINT_MAX) + bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); /* Apply further limits to prevent us looking using vectors that stretch @@ -1696,7 +1696,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride); if (thissad < bestsad) { @@ -1754,8 +1754,7 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - bestsad = fn_ptr->sdf(what, what_stride, best_address, - in_what_stride, UINT_MAX) + bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride) + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit); for (i=0; i x->mv_row_min) && (this_row_offset < x->mv_row_max)) { check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride); if (thissad < bestsad) { @@ -1834,8 +1833,7 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - bestsad = fn_ptr->sdf(what, what_stride, best_address, - in_what_stride, UINT_MAX) + bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride) + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit); for (i=0; i x->mv_row_min) && (this_row_offset < x->mv_row_max)) { check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride); if (thissad < bestsad) { diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index e7cbbc38b..5b452312e 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -11,6 +11,7 @@ #include "vpx_config.h" #include "./vpx_scale_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vp8_rtcd.h" #include "vp8/common/onyxc_int.h" #include "vp8/common/blockd.h" @@ -2126,55 +2127,55 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) } #endif - cpi->fn_ptr[BLOCK_16X16].sdf = vp8_sad16x16; + cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16; cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h; cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v; cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv; - cpi->fn_ptr[BLOCK_16X16].sdx3f = vp8_sad16x16x3; - cpi->fn_ptr[BLOCK_16X16].sdx8f = vp8_sad16x16x8; - cpi->fn_ptr[BLOCK_16X16].sdx4df = vp8_sad16x16x4d; + cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3; + cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8; + cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d; - cpi->fn_ptr[BLOCK_16X8].sdf = 
vp8_sad16x8; + cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8; cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL; - cpi->fn_ptr[BLOCK_16X8].sdx3f = vp8_sad16x8x3; - cpi->fn_ptr[BLOCK_16X8].sdx8f = vp8_sad16x8x8; - cpi->fn_ptr[BLOCK_16X8].sdx4df = vp8_sad16x8x4d; + cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3; + cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8; + cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d; - cpi->fn_ptr[BLOCK_8X16].sdf = vp8_sad8x16; + cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16; cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL; - cpi->fn_ptr[BLOCK_8X16].sdx3f = vp8_sad8x16x3; - cpi->fn_ptr[BLOCK_8X16].sdx8f = vp8_sad8x16x8; - cpi->fn_ptr[BLOCK_8X16].sdx4df = vp8_sad8x16x4d; + cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3; + cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8; + cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d; - cpi->fn_ptr[BLOCK_8X8].sdf = vp8_sad8x8; + cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8; cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL; - cpi->fn_ptr[BLOCK_8X8].sdx3f = vp8_sad8x8x3; - cpi->fn_ptr[BLOCK_8X8].sdx8f = vp8_sad8x8x8; - cpi->fn_ptr[BLOCK_8X8].sdx4df = vp8_sad8x8x4d; + cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3; + cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8; + cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d; - cpi->fn_ptr[BLOCK_4X4].sdf = vp8_sad4x4; + cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4; cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL; - cpi->fn_ptr[BLOCK_4X4].sdx3f = vp8_sad4x4x3; - cpi->fn_ptr[BLOCK_4X4].sdx8f = vp8_sad4x4x8; - cpi->fn_ptr[BLOCK_4X4].sdx4df = vp8_sad4x4x4d; + cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3; + cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8; + cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d; #if ARCH_X86 || ARCH_X86_64 cpi->fn_ptr[BLOCK_16X16].copymem = vp8_copy32xn; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index a945890e2..4f90402a7 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1690,16 +1690,16 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse }else if(xd->mb_to_top_edge==0) { /* only has left MB for sad calculation. */ near_sad[0] = near_sad[2] = INT_MAX; - near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX); + near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride); }else if(xd->mb_to_left_edge ==0) { /* only has left MB for sad calculation. 
*/ near_sad[1] = near_sad[2] = INT_MAX; - near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX); + near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride); }else { - near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX); - near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX); - near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, UINT_MAX); + near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride); + near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride); + near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride); } if(cpi->common.last_frame_type != KEY_FRAME) @@ -1714,14 +1714,14 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX; if(near_sad[4] != INT_MAX) - near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, UINT_MAX); + near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride); if(near_sad[5] != INT_MAX) - near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride, UINT_MAX); - near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride, UINT_MAX); + near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride); + near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride); if(near_sad[6] != INT_MAX) - near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride, UINT_MAX); + near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride); if(near_sad[7] != INT_MAX) - near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, UINT_MAX); + near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride); } if(cpi->common.last_frame_type != KEY_FRAME) diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 9b11c0da3..b4c814075 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -15,6 +15,7 @@ VP8_COMMON_SRCS-yes += common/onyxd.h VP8_COMMON_SRCS-yes += common/alloccommon.c VP8_COMMON_SRCS-yes += common/blockd.c VP8_COMMON_SRCS-yes += common/coefupdateprobs.h +VP8_COMMON_SRCS-yes += common/copy_c.c VP8_COMMON_SRCS-yes += common/debugmodes.c VP8_COMMON_SRCS-yes += common/default_coef_probs.h VP8_COMMON_SRCS-yes += common/dequantize.c @@ -60,7 +61,6 @@ VP8_COMMON_SRCS-yes += common/quant_common.c VP8_COMMON_SRCS-yes += common/reconinter.c VP8_COMMON_SRCS-yes += common/reconintra.c VP8_COMMON_SRCS-yes += common/reconintra4x4.c -VP8_COMMON_SRCS-yes += common/sad_c.c VP8_COMMON_SRCS-yes += common/setupintrarecon.c VP8_COMMON_SRCS-yes += common/swapyv12buffer.c VP8_COMMON_SRCS-yes += common/variance_c.c @@ -85,26 +85,23 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm 
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm -VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/sad_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_mmx.c VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_impl_mmx.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c -VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sad_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm -VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/sad_sse3.asm -VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/sad_ssse3.asm +VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm -VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm @@ -148,7 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_sad16x16_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM) @@ -170,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index b4c4c09e9..af9cc7320 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -11,6 +11,7 @@ #include "./vpx_config.h" #include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vpx/vpx_codec.h" #include "vpx/internal/vpx_codec_internal.h" @@ -650,6 +651,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, vp8_rtcd(); + vpx_dsp_rtcd(); vpx_scale_rtcd(); if (!ctx->priv) diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index d67745520..72e4770c0 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -12,6 +12,7 @@ #include #include #include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include 
"vpx/vpx_decoder.h" #include "vpx/vp8dx.h" @@ -107,6 +108,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, (void) data; vp8_rtcd(); + vpx_dsp_rtcd(); vpx_scale_rtcd(); /* This function only allocates space for the vpx_codec_alg_priv_t diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c index 1494c3fd7..57189df16 100644 --- a/vp9/common/vp9_mfqe.c +++ b/vp9/common/vp9_mfqe.c @@ -9,8 +9,9 @@ */ #include "./vpx_config.h" -#include "./vpx_scale_rtcd.h" #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_postproc.h" @@ -171,13 +172,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u, if (bs == BLOCK_16X16) { vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8; - sad = (vp9_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; + sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; } else if (bs == BLOCK_32X32) { vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10; - sad = (vp9_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10; + sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10; } else /* if (bs == BLOCK_64X64) */ { vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12; - sad = (vp9_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12; + sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12; } // vdiff > sad * 3 means vdiff should not be too small, otherwise, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 42cb8fe16..8765ac7de 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -922,177 +922,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; -add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad64x64 neon avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad32x64 avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad64x32 avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad32x16 avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad16x32/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad32x32 neon avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad16x16 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad16x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad8x16/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad8x8 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad8x4/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad4x8/, "$sse_x86inc"; - -add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad4x4/, "$sse_x86inc"; - -add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad64x64_avg avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad32x64_avg avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad64x32_avg avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad32x16_avg avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad32x32_avg avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad4x8_avg/, "$sse_x86inc"; - -add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vp9_sad4x4_avg/, "$sse_x86inc"; - 
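/* These add_proto/specialize pairs are input to the rtcd generator; each one
 * expands, roughly, into a per-function pointer that runtime setup points at
 * the best implementation for the detected CPU, which is why deleting the
 * vp9_sad* entries here is enough to retire them. A sketch of the generated
 * shape for one function now owned by vpx_dsp -- the exact names and guards
 * come from rtcd.pl, so treat this as an approximation: */
#include "vpx_ports/x86.h"  /* x86_simd_caps(), HAS_SSE2 */

unsigned int vpx_sad16x16_c(const unsigned char *src_ptr, int src_stride,
                            const unsigned char *ref_ptr, int ref_stride);
unsigned int vpx_sad16x16_sse2(const unsigned char *src_ptr, int src_stride,
                               const unsigned char *ref_ptr, int ref_stride);
unsigned int (*vpx_sad16x16)(const unsigned char *src_ptr, int src_stride,
                             const unsigned char *ref_ptr, int ref_stride);

static void setup_rtcd_internal(void) {
  int flags = x86_simd_caps();
  vpx_sad16x16 = vpx_sad16x16_c;  /* portable fallback */
  if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2;
}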
-add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad64x64x3/; - -add_proto qw/void vp9_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad32x32x3/; - -add_proto qw/void vp9_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad16x16x3 sse3 ssse3/; - -add_proto qw/void vp9_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad16x8x3 sse3 ssse3/; - -add_proto qw/void vp9_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad8x16x3 sse3/; - -add_proto qw/void vp9_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad8x8x3 sse3/; - -add_proto qw/void vp9_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad4x4x3 sse3/; - -add_proto qw/void vp9_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad64x64x8/; - -add_proto qw/void vp9_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad32x32x8/; - -add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad16x16x8 sse4_1/; -$vp9_sad16x16x8_sse4_1=vp9_sad16x16x8_sse4; - -add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad16x8x8 sse4_1/; -$vp9_sad16x8x8_sse4_1=vp9_sad16x8x8_sse4; - -add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad8x16x8 sse4_1/; -$vp9_sad8x16x8_sse4_1=vp9_sad8x16x8_sse4; - -add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad8x8x8 sse4_1/; -$vp9_sad8x8x8_sse4_1=vp9_sad8x8x8_sse4; - -add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad8x4x8/; - -add_proto qw/void vp9_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad4x8x8/; - -add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad4x4x8 sse4_1/; -$vp9_sad4x4x8_sse4_1=vp9_sad4x4x8_sse4; - -add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad64x64x4d sse2 avx2 neon/; - -add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad32x64x4d sse2/; - -add_proto qw/void vp9_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const 
uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad64x32x4d sse2/; - -add_proto qw/void vp9_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad32x16x4d sse2/; - -add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad16x32x4d sse2/; - -add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad32x32x4d sse2 avx2 neon/; - -add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad16x16x4d sse2 neon/; - -add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad16x8x4d sse2/; - -add_proto qw/void vp9_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad8x16x4d sse2/; - -add_proto qw/void vp9_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad8x8x4d sse2/; - -# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form -add_proto qw/void vp9_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad8x4x4d sse2/; - -add_proto qw/void vp9_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad4x8x4d sse/; - -add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad4x4x4d sse/; - add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc"; @@ -1682,171 +1511,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/; - add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - 
specialize qw/vp9_highbd_sad16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad8x4/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad4x8/; - - add_proto qw/unsigned int vp9_highbd_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vp9_highbd_sad4x4/; - - add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad64x64_avg/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad32x64_avg/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad64x32_avg/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad32x16_avg/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad16x32_avg/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad32x32_avg/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad16x16_avg/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad16x8_avg/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad8x16_avg/, 
"$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad8x8_avg/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad8x4_avg/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad4x8_avg/; - - add_proto qw/unsigned int vp9_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sad4x4_avg/; - - add_proto qw/void vp9_highbd_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad64x64x3/; - - add_proto qw/void vp9_highbd_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad32x32x3/; - - add_proto qw/void vp9_highbd_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad16x16x3/; - - add_proto qw/void vp9_highbd_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad16x8x3/; - - add_proto qw/void vp9_highbd_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad8x16x3/; - - add_proto qw/void vp9_highbd_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad8x8x3/; - - add_proto qw/void vp9_highbd_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad4x4x3/; - - add_proto qw/void vp9_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/vp9_highbd_sad64x64x8/; - - add_proto qw/void vp9_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/vp9_highbd_sad32x32x8/; - - add_proto qw/void vp9_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/vp9_highbd_sad16x16x8/; - - add_proto qw/void vp9_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/vp9_highbd_sad16x8x8/; - - add_proto qw/void vp9_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/vp9_highbd_sad8x16x8/; - - add_proto qw/void vp9_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/vp9_highbd_sad8x8x8/; - - add_proto qw/void vp9_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/vp9_highbd_sad8x4x8/; - - 
add_proto qw/void vp9_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/vp9_highbd_sad4x8x8/; - - add_proto qw/void vp9_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/vp9_highbd_sad4x4x8/; - - add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad64x64x4d sse2/; - - add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad32x64x4d sse2/; - - add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad64x32x4d sse2/; - - add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad32x16x4d sse2/; - - add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad16x32x4d sse2/; - - add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad32x32x4d sse2/; - - add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad16x16x4d sse2/; - - add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad16x8x4d sse2/; - - add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad8x16x4d sse2/; - - add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad8x8x4d sse2/; - - add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad8x4x4d sse2/; - - add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad4x8x4d sse2/; - - add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; - specialize qw/vp9_highbd_sad4x4x4d sse2/; - add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc"; diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index bf4037721..288d8690c 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -13,6 +13,7 @@ #include #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include 
"vpx_mem/vpx_mem.h" @@ -40,6 +41,7 @@ static void initialize_dec(void) { if (!init_done) { vp9_rtcd(); + vpx_dsp_rtcd(); vpx_scale_rtcd(); vp9_init_intra_predictors(); init_done = 1; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index a6e4c9c27..a1018adb8 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -14,6 +14,7 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vpx/internal/vpx_psnr.h" #include "vpx_ports/vpx_timer.h" @@ -318,6 +319,7 @@ void vp9_initialize_enc(void) { if (!init_done) { vp9_rtcd(); + vpx_dsp_rtcd(); vpx_scale_rtcd(); vp9_init_intra_predictors(); vp9_init_me_luts(); @@ -929,61 +931,61 @@ static void fnname##_bits12(const uint8_t *src_ptr, \ sad_array[i] >>= 4; \ } -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x16) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x16_avg) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x16x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x32) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x32_avg) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x32x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x32) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x32_avg) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x32x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x64) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x64_avg) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x64x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x32) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x32_avg) -MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad32x32x3) -MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad32x32x8) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x32x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x64) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x64_avg) -MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad64x64x3) -MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad64x64x8) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x64x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x16) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x16_avg) -MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x16x3) -MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x16x8) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x16x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x8) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x8_avg) -MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x8x3) -MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x8x8) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x8x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x16) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x16_avg) -MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x16x3) -MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x16x8) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x16x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x8) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x8_avg) -MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x8x3) -MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x8x8) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x8x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x4) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x4_avg) -MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x4x8) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x4x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x8) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x8_avg) -MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x8x8) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x8x4d) -MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x4) -MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x4_avg) -MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad4x4x3) -MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x4x8) -MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x4x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32) 
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) +MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3) +MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) +MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3) +MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) +MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3) +MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) +MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3) +MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) +MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3) +MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) +MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3) +MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) +MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) +MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) +MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) +MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3) +MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; @@ -991,398 +993,398 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { switch (cm->bit_depth) { case VPX_BITS_8: HIGHBD_BFP(BLOCK_32X16, - vp9_highbd_sad32x16_bits8, - vp9_highbd_sad32x16_avg_bits8, + vpx_highbd_sad32x16_bits8, + vpx_highbd_sad32x16_avg_bits8, vp9_highbd_variance32x16, vp9_highbd_sub_pixel_variance32x16, vp9_highbd_sub_pixel_avg_variance32x16, NULL, NULL, - vp9_highbd_sad32x16x4d_bits8) + vpx_highbd_sad32x16x4d_bits8) HIGHBD_BFP(BLOCK_16X32, - vp9_highbd_sad16x32_bits8, - vp9_highbd_sad16x32_avg_bits8, + vpx_highbd_sad16x32_bits8, + vpx_highbd_sad16x32_avg_bits8, vp9_highbd_variance16x32, vp9_highbd_sub_pixel_variance16x32, vp9_highbd_sub_pixel_avg_variance16x32, NULL, NULL, - vp9_highbd_sad16x32x4d_bits8) + vpx_highbd_sad16x32x4d_bits8) HIGHBD_BFP(BLOCK_64X32, - vp9_highbd_sad64x32_bits8, - vp9_highbd_sad64x32_avg_bits8, + vpx_highbd_sad64x32_bits8, + vpx_highbd_sad64x32_avg_bits8, vp9_highbd_variance64x32, vp9_highbd_sub_pixel_variance64x32, vp9_highbd_sub_pixel_avg_variance64x32, NULL, NULL, - vp9_highbd_sad64x32x4d_bits8) + vpx_highbd_sad64x32x4d_bits8) HIGHBD_BFP(BLOCK_32X64, - 
vp9_highbd_sad32x64_bits8, - vp9_highbd_sad32x64_avg_bits8, + vpx_highbd_sad32x64_bits8, + vpx_highbd_sad32x64_avg_bits8, vp9_highbd_variance32x64, vp9_highbd_sub_pixel_variance32x64, vp9_highbd_sub_pixel_avg_variance32x64, NULL, NULL, - vp9_highbd_sad32x64x4d_bits8) + vpx_highbd_sad32x64x4d_bits8) HIGHBD_BFP(BLOCK_32X32, - vp9_highbd_sad32x32_bits8, - vp9_highbd_sad32x32_avg_bits8, + vpx_highbd_sad32x32_bits8, + vpx_highbd_sad32x32_avg_bits8, vp9_highbd_variance32x32, vp9_highbd_sub_pixel_variance32x32, vp9_highbd_sub_pixel_avg_variance32x32, - vp9_highbd_sad32x32x3_bits8, - vp9_highbd_sad32x32x8_bits8, - vp9_highbd_sad32x32x4d_bits8) + vpx_highbd_sad32x32x3_bits8, + vpx_highbd_sad32x32x8_bits8, + vpx_highbd_sad32x32x4d_bits8) HIGHBD_BFP(BLOCK_64X64, - vp9_highbd_sad64x64_bits8, - vp9_highbd_sad64x64_avg_bits8, + vpx_highbd_sad64x64_bits8, + vpx_highbd_sad64x64_avg_bits8, vp9_highbd_variance64x64, vp9_highbd_sub_pixel_variance64x64, vp9_highbd_sub_pixel_avg_variance64x64, - vp9_highbd_sad64x64x3_bits8, - vp9_highbd_sad64x64x8_bits8, - vp9_highbd_sad64x64x4d_bits8) + vpx_highbd_sad64x64x3_bits8, + vpx_highbd_sad64x64x8_bits8, + vpx_highbd_sad64x64x4d_bits8) HIGHBD_BFP(BLOCK_16X16, - vp9_highbd_sad16x16_bits8, - vp9_highbd_sad16x16_avg_bits8, + vpx_highbd_sad16x16_bits8, + vpx_highbd_sad16x16_avg_bits8, vp9_highbd_variance16x16, vp9_highbd_sub_pixel_variance16x16, vp9_highbd_sub_pixel_avg_variance16x16, - vp9_highbd_sad16x16x3_bits8, - vp9_highbd_sad16x16x8_bits8, - vp9_highbd_sad16x16x4d_bits8) + vpx_highbd_sad16x16x3_bits8, + vpx_highbd_sad16x16x8_bits8, + vpx_highbd_sad16x16x4d_bits8) HIGHBD_BFP(BLOCK_16X8, - vp9_highbd_sad16x8_bits8, - vp9_highbd_sad16x8_avg_bits8, + vpx_highbd_sad16x8_bits8, + vpx_highbd_sad16x8_avg_bits8, vp9_highbd_variance16x8, vp9_highbd_sub_pixel_variance16x8, vp9_highbd_sub_pixel_avg_variance16x8, - vp9_highbd_sad16x8x3_bits8, - vp9_highbd_sad16x8x8_bits8, - vp9_highbd_sad16x8x4d_bits8) + vpx_highbd_sad16x8x3_bits8, + vpx_highbd_sad16x8x8_bits8, + vpx_highbd_sad16x8x4d_bits8) HIGHBD_BFP(BLOCK_8X16, - vp9_highbd_sad8x16_bits8, - vp9_highbd_sad8x16_avg_bits8, + vpx_highbd_sad8x16_bits8, + vpx_highbd_sad8x16_avg_bits8, vp9_highbd_variance8x16, vp9_highbd_sub_pixel_variance8x16, vp9_highbd_sub_pixel_avg_variance8x16, - vp9_highbd_sad8x16x3_bits8, - vp9_highbd_sad8x16x8_bits8, - vp9_highbd_sad8x16x4d_bits8) + vpx_highbd_sad8x16x3_bits8, + vpx_highbd_sad8x16x8_bits8, + vpx_highbd_sad8x16x4d_bits8) HIGHBD_BFP(BLOCK_8X8, - vp9_highbd_sad8x8_bits8, - vp9_highbd_sad8x8_avg_bits8, + vpx_highbd_sad8x8_bits8, + vpx_highbd_sad8x8_avg_bits8, vp9_highbd_variance8x8, vp9_highbd_sub_pixel_variance8x8, vp9_highbd_sub_pixel_avg_variance8x8, - vp9_highbd_sad8x8x3_bits8, - vp9_highbd_sad8x8x8_bits8, - vp9_highbd_sad8x8x4d_bits8) + vpx_highbd_sad8x8x3_bits8, + vpx_highbd_sad8x8x8_bits8, + vpx_highbd_sad8x8x4d_bits8) HIGHBD_BFP(BLOCK_8X4, - vp9_highbd_sad8x4_bits8, - vp9_highbd_sad8x4_avg_bits8, + vpx_highbd_sad8x4_bits8, + vpx_highbd_sad8x4_avg_bits8, vp9_highbd_variance8x4, vp9_highbd_sub_pixel_variance8x4, vp9_highbd_sub_pixel_avg_variance8x4, NULL, - vp9_highbd_sad8x4x8_bits8, - vp9_highbd_sad8x4x4d_bits8) + vpx_highbd_sad8x4x8_bits8, + vpx_highbd_sad8x4x4d_bits8) HIGHBD_BFP(BLOCK_4X8, - vp9_highbd_sad4x8_bits8, - vp9_highbd_sad4x8_avg_bits8, + vpx_highbd_sad4x8_bits8, + vpx_highbd_sad4x8_avg_bits8, vp9_highbd_variance4x8, vp9_highbd_sub_pixel_variance4x8, vp9_highbd_sub_pixel_avg_variance4x8, NULL, - vp9_highbd_sad4x8x8_bits8, - vp9_highbd_sad4x8x4d_bits8) + 
vpx_highbd_sad4x8x8_bits8, + vpx_highbd_sad4x8x4d_bits8) HIGHBD_BFP(BLOCK_4X4, - vp9_highbd_sad4x4_bits8, - vp9_highbd_sad4x4_avg_bits8, + vpx_highbd_sad4x4_bits8, + vpx_highbd_sad4x4_avg_bits8, vp9_highbd_variance4x4, vp9_highbd_sub_pixel_variance4x4, vp9_highbd_sub_pixel_avg_variance4x4, - vp9_highbd_sad4x4x3_bits8, - vp9_highbd_sad4x4x8_bits8, - vp9_highbd_sad4x4x4d_bits8) + vpx_highbd_sad4x4x3_bits8, + vpx_highbd_sad4x4x8_bits8, + vpx_highbd_sad4x4x4d_bits8) break; case VPX_BITS_10: HIGHBD_BFP(BLOCK_32X16, - vp9_highbd_sad32x16_bits10, - vp9_highbd_sad32x16_avg_bits10, + vpx_highbd_sad32x16_bits10, + vpx_highbd_sad32x16_avg_bits10, vp9_highbd_10_variance32x16, vp9_highbd_10_sub_pixel_variance32x16, vp9_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL, - vp9_highbd_sad32x16x4d_bits10) + vpx_highbd_sad32x16x4d_bits10) HIGHBD_BFP(BLOCK_16X32, - vp9_highbd_sad16x32_bits10, - vp9_highbd_sad16x32_avg_bits10, + vpx_highbd_sad16x32_bits10, + vpx_highbd_sad16x32_avg_bits10, vp9_highbd_10_variance16x32, vp9_highbd_10_sub_pixel_variance16x32, vp9_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL, - vp9_highbd_sad16x32x4d_bits10) + vpx_highbd_sad16x32x4d_bits10) HIGHBD_BFP(BLOCK_64X32, - vp9_highbd_sad64x32_bits10, - vp9_highbd_sad64x32_avg_bits10, + vpx_highbd_sad64x32_bits10, + vpx_highbd_sad64x32_avg_bits10, vp9_highbd_10_variance64x32, vp9_highbd_10_sub_pixel_variance64x32, vp9_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL, - vp9_highbd_sad64x32x4d_bits10) + vpx_highbd_sad64x32x4d_bits10) HIGHBD_BFP(BLOCK_32X64, - vp9_highbd_sad32x64_bits10, - vp9_highbd_sad32x64_avg_bits10, + vpx_highbd_sad32x64_bits10, + vpx_highbd_sad32x64_avg_bits10, vp9_highbd_10_variance32x64, vp9_highbd_10_sub_pixel_variance32x64, vp9_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL, - vp9_highbd_sad32x64x4d_bits10) + vpx_highbd_sad32x64x4d_bits10) HIGHBD_BFP(BLOCK_32X32, - vp9_highbd_sad32x32_bits10, - vp9_highbd_sad32x32_avg_bits10, + vpx_highbd_sad32x32_bits10, + vpx_highbd_sad32x32_avg_bits10, vp9_highbd_10_variance32x32, vp9_highbd_10_sub_pixel_variance32x32, vp9_highbd_10_sub_pixel_avg_variance32x32, - vp9_highbd_sad32x32x3_bits10, - vp9_highbd_sad32x32x8_bits10, - vp9_highbd_sad32x32x4d_bits10) + vpx_highbd_sad32x32x3_bits10, + vpx_highbd_sad32x32x8_bits10, + vpx_highbd_sad32x32x4d_bits10) HIGHBD_BFP(BLOCK_64X64, - vp9_highbd_sad64x64_bits10, - vp9_highbd_sad64x64_avg_bits10, + vpx_highbd_sad64x64_bits10, + vpx_highbd_sad64x64_avg_bits10, vp9_highbd_10_variance64x64, vp9_highbd_10_sub_pixel_variance64x64, vp9_highbd_10_sub_pixel_avg_variance64x64, - vp9_highbd_sad64x64x3_bits10, - vp9_highbd_sad64x64x8_bits10, - vp9_highbd_sad64x64x4d_bits10) + vpx_highbd_sad64x64x3_bits10, + vpx_highbd_sad64x64x8_bits10, + vpx_highbd_sad64x64x4d_bits10) HIGHBD_BFP(BLOCK_16X16, - vp9_highbd_sad16x16_bits10, - vp9_highbd_sad16x16_avg_bits10, + vpx_highbd_sad16x16_bits10, + vpx_highbd_sad16x16_avg_bits10, vp9_highbd_10_variance16x16, vp9_highbd_10_sub_pixel_variance16x16, vp9_highbd_10_sub_pixel_avg_variance16x16, - vp9_highbd_sad16x16x3_bits10, - vp9_highbd_sad16x16x8_bits10, - vp9_highbd_sad16x16x4d_bits10) + vpx_highbd_sad16x16x3_bits10, + vpx_highbd_sad16x16x8_bits10, + vpx_highbd_sad16x16x4d_bits10) HIGHBD_BFP(BLOCK_16X8, - vp9_highbd_sad16x8_bits10, - vp9_highbd_sad16x8_avg_bits10, + vpx_highbd_sad16x8_bits10, + vpx_highbd_sad16x8_avg_bits10, vp9_highbd_10_variance16x8, vp9_highbd_10_sub_pixel_variance16x8, vp9_highbd_10_sub_pixel_avg_variance16x8, - vp9_highbd_sad16x8x3_bits10, - vp9_highbd_sad16x8x8_bits10, - 
vp9_highbd_sad16x8x4d_bits10) + vpx_highbd_sad16x8x3_bits10, + vpx_highbd_sad16x8x8_bits10, + vpx_highbd_sad16x8x4d_bits10) HIGHBD_BFP(BLOCK_8X16, - vp9_highbd_sad8x16_bits10, - vp9_highbd_sad8x16_avg_bits10, + vpx_highbd_sad8x16_bits10, + vpx_highbd_sad8x16_avg_bits10, vp9_highbd_10_variance8x16, vp9_highbd_10_sub_pixel_variance8x16, vp9_highbd_10_sub_pixel_avg_variance8x16, - vp9_highbd_sad8x16x3_bits10, - vp9_highbd_sad8x16x8_bits10, - vp9_highbd_sad8x16x4d_bits10) + vpx_highbd_sad8x16x3_bits10, + vpx_highbd_sad8x16x8_bits10, + vpx_highbd_sad8x16x4d_bits10) HIGHBD_BFP(BLOCK_8X8, - vp9_highbd_sad8x8_bits10, - vp9_highbd_sad8x8_avg_bits10, + vpx_highbd_sad8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vp9_highbd_10_variance8x8, vp9_highbd_10_sub_pixel_variance8x8, vp9_highbd_10_sub_pixel_avg_variance8x8, - vp9_highbd_sad8x8x3_bits10, - vp9_highbd_sad8x8x8_bits10, - vp9_highbd_sad8x8x4d_bits10) + vpx_highbd_sad8x8x3_bits10, + vpx_highbd_sad8x8x8_bits10, + vpx_highbd_sad8x8x4d_bits10) HIGHBD_BFP(BLOCK_8X4, - vp9_highbd_sad8x4_bits10, - vp9_highbd_sad8x4_avg_bits10, + vpx_highbd_sad8x4_bits10, + vpx_highbd_sad8x4_avg_bits10, vp9_highbd_10_variance8x4, vp9_highbd_10_sub_pixel_variance8x4, vp9_highbd_10_sub_pixel_avg_variance8x4, NULL, - vp9_highbd_sad8x4x8_bits10, - vp9_highbd_sad8x4x4d_bits10) + vpx_highbd_sad8x4x8_bits10, + vpx_highbd_sad8x4x4d_bits10) HIGHBD_BFP(BLOCK_4X8, - vp9_highbd_sad4x8_bits10, - vp9_highbd_sad4x8_avg_bits10, + vpx_highbd_sad4x8_bits10, + vpx_highbd_sad4x8_avg_bits10, vp9_highbd_10_variance4x8, vp9_highbd_10_sub_pixel_variance4x8, vp9_highbd_10_sub_pixel_avg_variance4x8, NULL, - vp9_highbd_sad4x8x8_bits10, - vp9_highbd_sad4x8x4d_bits10) + vpx_highbd_sad4x8x8_bits10, + vpx_highbd_sad4x8x4d_bits10) HIGHBD_BFP(BLOCK_4X4, - vp9_highbd_sad4x4_bits10, - vp9_highbd_sad4x4_avg_bits10, + vpx_highbd_sad4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vp9_highbd_10_variance4x4, vp9_highbd_10_sub_pixel_variance4x4, vp9_highbd_10_sub_pixel_avg_variance4x4, - vp9_highbd_sad4x4x3_bits10, - vp9_highbd_sad4x4x8_bits10, - vp9_highbd_sad4x4x4d_bits10) + vpx_highbd_sad4x4x3_bits10, + vpx_highbd_sad4x4x8_bits10, + vpx_highbd_sad4x4x4d_bits10) break; case VPX_BITS_12: HIGHBD_BFP(BLOCK_32X16, - vp9_highbd_sad32x16_bits12, - vp9_highbd_sad32x16_avg_bits12, + vpx_highbd_sad32x16_bits12, + vpx_highbd_sad32x16_avg_bits12, vp9_highbd_12_variance32x16, vp9_highbd_12_sub_pixel_variance32x16, vp9_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL, - vp9_highbd_sad32x16x4d_bits12) + vpx_highbd_sad32x16x4d_bits12) HIGHBD_BFP(BLOCK_16X32, - vp9_highbd_sad16x32_bits12, - vp9_highbd_sad16x32_avg_bits12, + vpx_highbd_sad16x32_bits12, + vpx_highbd_sad16x32_avg_bits12, vp9_highbd_12_variance16x32, vp9_highbd_12_sub_pixel_variance16x32, vp9_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL, - vp9_highbd_sad16x32x4d_bits12) + vpx_highbd_sad16x32x4d_bits12) HIGHBD_BFP(BLOCK_64X32, - vp9_highbd_sad64x32_bits12, - vp9_highbd_sad64x32_avg_bits12, + vpx_highbd_sad64x32_bits12, + vpx_highbd_sad64x32_avg_bits12, vp9_highbd_12_variance64x32, vp9_highbd_12_sub_pixel_variance64x32, vp9_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL, - vp9_highbd_sad64x32x4d_bits12) + vpx_highbd_sad64x32x4d_bits12) HIGHBD_BFP(BLOCK_32X64, - vp9_highbd_sad32x64_bits12, - vp9_highbd_sad32x64_avg_bits12, + vpx_highbd_sad32x64_bits12, + vpx_highbd_sad32x64_avg_bits12, vp9_highbd_12_variance32x64, vp9_highbd_12_sub_pixel_variance32x64, vp9_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL, - vp9_highbd_sad32x64x4d_bits12) + 
vpx_highbd_sad32x64x4d_bits12) HIGHBD_BFP(BLOCK_32X32, - vp9_highbd_sad32x32_bits12, - vp9_highbd_sad32x32_avg_bits12, + vpx_highbd_sad32x32_bits12, + vpx_highbd_sad32x32_avg_bits12, vp9_highbd_12_variance32x32, vp9_highbd_12_sub_pixel_variance32x32, vp9_highbd_12_sub_pixel_avg_variance32x32, - vp9_highbd_sad32x32x3_bits12, - vp9_highbd_sad32x32x8_bits12, - vp9_highbd_sad32x32x4d_bits12) + vpx_highbd_sad32x32x3_bits12, + vpx_highbd_sad32x32x8_bits12, + vpx_highbd_sad32x32x4d_bits12) HIGHBD_BFP(BLOCK_64X64, - vp9_highbd_sad64x64_bits12, - vp9_highbd_sad64x64_avg_bits12, + vpx_highbd_sad64x64_bits12, + vpx_highbd_sad64x64_avg_bits12, vp9_highbd_12_variance64x64, vp9_highbd_12_sub_pixel_variance64x64, vp9_highbd_12_sub_pixel_avg_variance64x64, - vp9_highbd_sad64x64x3_bits12, - vp9_highbd_sad64x64x8_bits12, - vp9_highbd_sad64x64x4d_bits12) + vpx_highbd_sad64x64x3_bits12, + vpx_highbd_sad64x64x8_bits12, + vpx_highbd_sad64x64x4d_bits12) HIGHBD_BFP(BLOCK_16X16, - vp9_highbd_sad16x16_bits12, - vp9_highbd_sad16x16_avg_bits12, + vpx_highbd_sad16x16_bits12, + vpx_highbd_sad16x16_avg_bits12, vp9_highbd_12_variance16x16, vp9_highbd_12_sub_pixel_variance16x16, vp9_highbd_12_sub_pixel_avg_variance16x16, - vp9_highbd_sad16x16x3_bits12, - vp9_highbd_sad16x16x8_bits12, - vp9_highbd_sad16x16x4d_bits12) + vpx_highbd_sad16x16x3_bits12, + vpx_highbd_sad16x16x8_bits12, + vpx_highbd_sad16x16x4d_bits12) HIGHBD_BFP(BLOCK_16X8, - vp9_highbd_sad16x8_bits12, - vp9_highbd_sad16x8_avg_bits12, + vpx_highbd_sad16x8_bits12, + vpx_highbd_sad16x8_avg_bits12, vp9_highbd_12_variance16x8, vp9_highbd_12_sub_pixel_variance16x8, vp9_highbd_12_sub_pixel_avg_variance16x8, - vp9_highbd_sad16x8x3_bits12, - vp9_highbd_sad16x8x8_bits12, - vp9_highbd_sad16x8x4d_bits12) + vpx_highbd_sad16x8x3_bits12, + vpx_highbd_sad16x8x8_bits12, + vpx_highbd_sad16x8x4d_bits12) HIGHBD_BFP(BLOCK_8X16, - vp9_highbd_sad8x16_bits12, - vp9_highbd_sad8x16_avg_bits12, + vpx_highbd_sad8x16_bits12, + vpx_highbd_sad8x16_avg_bits12, vp9_highbd_12_variance8x16, vp9_highbd_12_sub_pixel_variance8x16, vp9_highbd_12_sub_pixel_avg_variance8x16, - vp9_highbd_sad8x16x3_bits12, - vp9_highbd_sad8x16x8_bits12, - vp9_highbd_sad8x16x4d_bits12) + vpx_highbd_sad8x16x3_bits12, + vpx_highbd_sad8x16x8_bits12, + vpx_highbd_sad8x16x4d_bits12) HIGHBD_BFP(BLOCK_8X8, - vp9_highbd_sad8x8_bits12, - vp9_highbd_sad8x8_avg_bits12, + vpx_highbd_sad8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vp9_highbd_12_variance8x8, vp9_highbd_12_sub_pixel_variance8x8, vp9_highbd_12_sub_pixel_avg_variance8x8, - vp9_highbd_sad8x8x3_bits12, - vp9_highbd_sad8x8x8_bits12, - vp9_highbd_sad8x8x4d_bits12) + vpx_highbd_sad8x8x3_bits12, + vpx_highbd_sad8x8x8_bits12, + vpx_highbd_sad8x8x4d_bits12) HIGHBD_BFP(BLOCK_8X4, - vp9_highbd_sad8x4_bits12, - vp9_highbd_sad8x4_avg_bits12, + vpx_highbd_sad8x4_bits12, + vpx_highbd_sad8x4_avg_bits12, vp9_highbd_12_variance8x4, vp9_highbd_12_sub_pixel_variance8x4, vp9_highbd_12_sub_pixel_avg_variance8x4, NULL, - vp9_highbd_sad8x4x8_bits12, - vp9_highbd_sad8x4x4d_bits12) + vpx_highbd_sad8x4x8_bits12, + vpx_highbd_sad8x4x4d_bits12) HIGHBD_BFP(BLOCK_4X8, - vp9_highbd_sad4x8_bits12, - vp9_highbd_sad4x8_avg_bits12, + vpx_highbd_sad4x8_bits12, + vpx_highbd_sad4x8_avg_bits12, vp9_highbd_12_variance4x8, vp9_highbd_12_sub_pixel_variance4x8, vp9_highbd_12_sub_pixel_avg_variance4x8, NULL, - vp9_highbd_sad4x8x8_bits12, - vp9_highbd_sad4x8x4d_bits12) + vpx_highbd_sad4x8x8_bits12, + vpx_highbd_sad4x8x4d_bits12) HIGHBD_BFP(BLOCK_4X4, - vp9_highbd_sad4x4_bits12, - vp9_highbd_sad4x4_avg_bits12, + 
vpx_highbd_sad4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vp9_highbd_12_variance4x4, vp9_highbd_12_sub_pixel_variance4x4, vp9_highbd_12_sub_pixel_avg_variance4x4, - vp9_highbd_sad4x4x3_bits12, - vp9_highbd_sad4x4x8_bits12, - vp9_highbd_sad4x4x4d_bits12) + vpx_highbd_sad4x4x3_bits12, + vpx_highbd_sad4x4x8_bits12, + vpx_highbd_sad4x4x4d_bits12) break; default: @@ -1799,64 +1801,64 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->fn_ptr[BT].sdx8f = SDX8F; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; - BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg, + BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vp9_variance32x16, vp9_sub_pixel_variance32x16, - vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d) + vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d) - BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg, + BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vp9_variance16x32, vp9_sub_pixel_variance16x32, - vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d) + vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d) - BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg, + BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vp9_variance64x32, vp9_sub_pixel_variance64x32, - vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d) + vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d) - BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg, + BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vp9_variance32x64, vp9_sub_pixel_variance32x64, - vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d) + vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d) - BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg, + BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vp9_variance32x32, vp9_sub_pixel_variance32x32, - vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8, - vp9_sad32x32x4d) + vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8, + vpx_sad32x32x4d) - BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg, + BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vp9_variance64x64, vp9_sub_pixel_variance64x64, - vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8, - vp9_sad64x64x4d) + vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8, + vpx_sad64x64x4d) - BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg, + BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vp9_variance16x16, vp9_sub_pixel_variance16x16, - vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8, - vp9_sad16x16x4d) + vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8, + vpx_sad16x16x4d) - BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg, + BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vp9_variance16x8, vp9_sub_pixel_variance16x8, vp9_sub_pixel_avg_variance16x8, - vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d) + vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d) - BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg, + BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vp9_variance8x16, vp9_sub_pixel_variance8x16, vp9_sub_pixel_avg_variance8x16, - vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d) + vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d) - BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg, + BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vp9_variance8x8, vp9_sub_pixel_variance8x8, vp9_sub_pixel_avg_variance8x8, - vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d) + vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d) - BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg, + BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vp9_variance8x4, vp9_sub_pixel_variance8x4, - vp9_sub_pixel_avg_variance8x4, NULL, 
vp9_sad8x4x8, vp9_sad8x4x4d) + vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d) - BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg, + BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vp9_variance4x8, vp9_sub_pixel_variance4x8, - vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d) + vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d) - BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg, + BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vp9_variance4x4, vp9_sub_pixel_variance4x4, vp9_sub_pixel_avg_variance4x4, - vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) + vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 06c3885c1..d5eeb9cc5 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -10,6 +10,9 @@ #include +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" + #include "vpx_mem/vpx_mem.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_mcomp.h" @@ -74,8 +77,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, x->mv_row_min = tmp_row_min; x->mv_row_max = tmp_row_max; - return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride); + return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride); } static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, @@ -87,7 +90,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, // Try zero MV first // FIXME should really use something like near/nearest MV and/or MV prediction - err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); dst_mv->as_int = 0; @@ -123,7 +126,7 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) { // Try zero MV first // FIXME should really use something like near/nearest MV and/or MV prediction - err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); dst_mv->as_int = 0; @@ -146,7 +149,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) { x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride, 0, 0, 0); - err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride); // find best diff --git a/vp9/encoder/x86/vp9_sad_ssse3.asm b/vp9/encoder/x86/vp9_sad_ssse3.asm deleted file mode 100644 index 0cb35424e..000000000 --- a/vp9/encoder/x86/vp9_sad_ssse3.asm +++ /dev/null @@ -1,370 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
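; The file deleted below implements the three-candidate SAD used by the
; encoder's motion search: one call returns SADs against ref, ref + 1 and
; ref + 2, dispatching through a jump table on the low four bits of ref_ptr
; so that aligned movdqa loads plus palignr can be used instead of unaligned
; loads. A plain-C sketch of the computation it vectorizes (hypothetical
; helper, 8-bit pixels assumed):
;
;   static void sad16x16x3(const unsigned char *src, int src_stride,
;                          const unsigned char *ref, int ref_stride,
;                          unsigned int *sads) {
;     int off, r, c;
;     for (off = 0; off < 3; ++off) {  /* candidate offsets 0, 1, 2 */
;       unsigned int sad = 0;
;       for (r = 0; r < 16; ++r) {
;         for (c = 0; c < 16; ++c) {
;           const int d = src[r * src_stride + c] -
;                         ref[r * ref_stride + c + off];
;           sad += d < 0 ? -d : d;
;         }
;       }
;       sads[off] = sad;
;     }
;   }
;
; Per the commit message these kernels are not dropped but move into vpx_dsp,
; matching the vpx_sad16x16x3 / vpx_sad16x8x3 specialize lines that replace
; the vp9_* prototypes removed above.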
-; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X3 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm5, XMMWORD PTR [rdi] - lddqu xmm6, XMMWORD PTR [rdi+1] - lddqu xmm7, XMMWORD PTR [rdi+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm1, XMMWORD PTR [rdi] - lddqu xmm2, XMMWORD PTR [rdi+1] - lddqu xmm3, XMMWORD PTR [rdi+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - lddqu xmm1, XMMWORD PTR [rdi+rdx] - lddqu xmm2, XMMWORD PTR [rdi+rdx+1] - lddqu xmm3, XMMWORD PTR [rdi+rdx+2] - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X2X3_OFFSET 2 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm7, XMMWORD PTR [rdi+16] - - movdqa xmm5, xmm7 - palignr xmm5, xmm4, %2 - - movdqa xmm6, xmm7 - palignr xmm6, xmm4, (%2+1) - - palignr xmm7, xmm4, (%2+2) - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm3, XMMWORD PTR [rdi+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - movdqa xmm4, XMMWORD PTR [rdi+rdx] - movdqa xmm3, XMMWORD PTR [rdi+rdx+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X16X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -%macro PROCESS_16X8X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -;void int vp9_sad16x16x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad16x16x3_ssse3) PRIVATE -sym(vp9_sad16x16x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vp9_sad16x16x3_ssse3_skiptable -.vp9_sad16x16x3_ssse3_jumptable: - dd .vp9_sad16x16x3_ssse3_aligned_by_0 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_1 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_2 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_3 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_4 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_5 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_6 - 
.vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_7 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_8 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_9 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump -.vp9_sad16x16x3_ssse3_skiptable: - - call .vp9_sad16x16x3_ssse3_do_jump -.vp9_sad16x16x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X16X3_OFFSET 0, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3 - -.vp9_sad16x16x3_ssse3_aligned_by_15: - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vp9_sad16x16x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void int vp9_sad16x8x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad16x8x3_ssse3) PRIVATE -sym(vp9_sad16x8x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vp9_sad16x8x3_ssse3_skiptable -.vp9_sad16x8x3_ssse3_jumptable: - dd .vp9_sad16x8x3_ssse3_aligned_by_0 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_1 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_2 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_3 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_4 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_5 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_6 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_7 - 
.vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_8 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_9 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump -.vp9_sad16x8x3_ssse3_skiptable: - - call .vp9_sad16x8x3_ssse3_do_jump -.vp9_sad16x8x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X8X3_OFFSET 0, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3 - -.vp9_sad16x8x3_ssse3_aligned_by_15: - - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vp9_sad16x8x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index f5f9be8a1..7359b2de0 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -69,7 +69,6 @@ VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c VP9_CX_SRCS-yes += encoder/vp9_rd.c VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_pickmode.c -VP9_CX_SRCS-yes += encoder/vp9_sad.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h VP9_CX_SRCS-yes += encoder/vp9_speed_features.c @@ -104,15 +103,11 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += 
encoder/x86/vp9_highbd_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c @@ -121,12 +116,10 @@ endif ifeq ($(CONFIG_USE_X86INC),yes) VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm endif @@ -136,9 +129,6 @@ ifeq ($(ARCH_X86_64),yes) VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm endif -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm -VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad_intrin_avx2.c VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c @@ -161,8 +151,6 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c endif VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c diff --git a/vp9/encoder/arm/neon/vp9_sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c similarity index 96% rename from vp9/encoder/arm/neon/vp9_sad4d_neon.c rename to vpx_dsp/arm/sad4d_neon.c index cec1689f1..c7704dc1b 100644 --- a/vp9/encoder/arm/neon/vp9_sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -9,9 +9,9 @@ */ #include -#include "./vp9_rtcd.h" -#include "./vpx_config.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, @@ -80,9 +80,9 @@ static void sad_neon_32(const uint8x16_t vec_src_00, vget_high_u8(vec_ref_16)); } -void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride, +void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, const uint8_t* const ref[4], int ref_stride, - unsigned int *res) { + uint32_t *res) { int i; uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); @@ -126,9 +126,9 @@ void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride, res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); } -void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride, +void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, const uint8_t* const ref[4], int ref_stride, - unsigned int *res) { + uint32_t *res) { int i; uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); @@ -170,9 +170,9 @@ void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride, res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); } -void vp9_sad16x16x4d_neon(const uint8_t *src, int src_stride, +void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, const uint8_t* const ref[4], int ref_stride, - 
unsigned int *res) { + uint32_t *res) { int i; uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); diff --git a/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm b/vpx_dsp/arm/sad_media.asm similarity index 97% rename from vp8/common/arm/armv6/vp8_sad16x16_armv6.asm rename to vpx_dsp/arm/sad_media.asm index 1b4f5cf3b..aed1d3a22 100644 --- a/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm +++ b/vpx_dsp/arm/sad_media.asm @@ -9,7 +9,7 @@ ; - EXPORT |vp8_sad16x16_armv6| + EXPORT |vpx_sad16x16_media| ARM REQUIRE8 @@ -21,8 +21,7 @@ ; r1 int src_stride ; r2 const unsigned char *ref_ptr ; r3 int ref_stride -; stack max_sad (not used) -|vp8_sad16x16_armv6| PROC +|vpx_sad16x16_media| PROC stmfd sp!, {r4-r12, lr} pld [r0, r1, lsl #0] diff --git a/vp9/encoder/arm/neon/vp9_sad_neon.c b/vpx_dsp/arm/sad_neon.c similarity index 65% rename from vp9/encoder/arm/neon/vp9_sad_neon.c rename to vpx_dsp/arm/sad_neon.c index c4cd85680..173f08ac3 100644 --- a/vp9/encoder/arm/neon/vp9_sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -9,11 +9,113 @@ */ #include -#include "./vp9_rtcd.h" + #include "./vpx_config.h" #include "vpx/vpx_integer.h" +unsigned int vpx_sad8x16_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 15; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int vpx_sad4x4_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x2_t d1; + uint64x1_t d3; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 3; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + d1 = vpaddl_u16(vget_low_u16(q12)); + d3 = vpaddl_u32(d1); + + return vget_lane_u32(vreinterpret_u32_u64(d3), 0); +} + +unsigned int vpx_sad16x8_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + + for (i = 0; i < 7; i++) { + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), @@ -34,7 
 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                     const uint16x8_t vec_hi) {
   const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
@@ -34,7 +136,7 @@ static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
   return vget_lane_u32(c, 0);
 }

-unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -70,7 +172,7 @@ unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
   return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
 }

-unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -95,7 +197,7 @@ unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
   return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
 }

-unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -114,7 +216,7 @@ unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
   return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
 }

-unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum = vdupq_n_u16(0);
diff --git a/vp9/encoder/vp9_sad.c b/vpx_dsp/sad.c
similarity index 59%
rename from vp9/encoder/vp9_sad.c
rename to vpx_dsp/sad.c
index 73134f2f2..9db312fbe 100644
--- a/vp9/encoder/vp9_sad.c
+++ b/vpx_dsp/sad.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -10,15 +10,19 @@

 #include <stdlib.h>

-#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+
 #if CONFIG_VP9_HIGHBITDEPTH
 #include "vp9/common/vp9_common.h"
-#endif
-#include "vp9/encoder/vp9_variance.h"
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Temporary ...
+#define ROUND_POWER_OF_TWO(value, n) \
+    (((value) + (1 << ((n) - 1))) >> (n))

+/* Sum the absolute difference between every corresponding element of the
+ * buffers. */
 static INLINE unsigned int sad(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                int width, int height) {
@@ -35,35 +39,78 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
   return sad;
 }

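The ROUND_POWER_OF_TWO macro introduced above computes value / 2^n rounded half up; with n == 1 it is exactly the (a + b + 1) >> 1 rounding average used by the averaging helpers that follow. A standalone sanity check (illustrative, not part of the patch):

    #include <assert.h>

    #define ROUND_POWER_OF_TWO(value, n) \
        (((value) + (1 << ((n) - 1))) >> (n))

    int main(void) {
      assert(ROUND_POWER_OF_TWO(6, 1) == 3);            /* 3.0 stays 3 */
      assert(ROUND_POWER_OF_TWO(7, 1) == 4);            /* 3.5 rounds up */
      assert(ROUND_POWER_OF_TWO(10, 2) == 3);           /* 2.5 rounds up */
      assert(ROUND_POWER_OF_TWO(250 + 253, 1) == 252);  /* pixel average */
      return 0;
    }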
+/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
+ * The function averages every corresponding element of the buffers and stores
+ * the value in a third buffer, comp_pred.
+ * pred and comp_pred are assumed to have stride = width.
+ * In the usage below comp_pred is a local array.
+ */
+static INLINE void avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+                                   int width, int height, const uint8_t *ref8,
+                                   int ref_stride) {
+  int i, j;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #define sadMxN(m, n) \
-unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
                                   const uint8_t *ref, int ref_stride) { \
   return sad(src, src_stride, ref, ref_stride, m, n); \
 } \
-unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
                                       const uint8_t *second_pred) { \
   uint8_t comp_pred[m * n]; \
-  vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
   return sad(src, src_stride, comp_pred, m, m, n); \
 }

+// Depending on the call sites, this could pass **ref_array to avoid taking
+// the address in the subsequent call and de-dup with the 4D version below.
 #define sadMxNxK(m, n, k) \
-void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
-                                const uint8_t *ref, int ref_stride, \
-                                unsigned int *sads) { \
+void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+                                const uint8_t *ref_array, int ref_stride, \
+                                uint32_t *sad_array) { \
   int i; \
   for (i = 0; i < k; ++i) \
-    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
+    sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
 }

+// This appears to be equivalent to the above when k == 4 and ref_array is
+// const.
 #define sadMxNx4D(m, n) \
-void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
-                             const uint8_t *const refs[], int ref_stride, \
-                             unsigned int *sads) { \
+void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                             const uint8_t *const ref_array[], int ref_stride, \
+                             uint32_t *sad_array) { \
   int i; \
   for (i = 0; i < 4; ++i) \
-    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
+    sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
 }
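The xK and x4D forms exist for motion search, where one source block is scored against several closely spaced reference candidates at once; the note above is about sharing the source reads between them. A typical call to the 4D form looks like this (an illustrative sketch; src, ref_buf and the strides are assumed to point into valid frame buffers):

    const uint8_t *const ref_array[4] = {
      ref_buf,                   /* candidate 0: the search center */
      ref_buf + 1,               /* candidate 1: one pixel right */
      ref_buf + ref_stride,      /* candidate 2: one row down */
      ref_buf + ref_stride + 1   /* candidate 3: diagonal */
    };
    uint32_t sad_array[4];

    vpx_sad16x16x4d_c(src, src_stride, ref_array, ref_stride, sad_array);
    /* sad_array[i] now holds the SAD of src against ref_array[i]. */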
 // 64x64
@@ -169,40 +216,40 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
 }

 #define highbd_sadMxN(m, n) \
-unsigned int vp9_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride) { \
   return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
 } \
-unsigned int vp9_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
+unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
                                              int src_stride, \
                                              const uint8_t *ref, \
                                              int ref_stride, \
                                              const uint8_t *second_pred) { \
   uint16_t comp_pred[m * n]; \
-  vp9_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  highbd_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
   return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
 }

 #define highbd_sadMxNxK(m, n, k) \
-void vp9_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
-                                       const uint8_t *ref, int ref_stride, \
-                                       unsigned int *sads) { \
+void vpx_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref_array, int ref_stride, \
+                                       uint32_t *sad_array) { \
   int i; \
   for (i = 0; i < k; ++i) { \
-    sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, &ref[i], \
-                                          ref_stride); \
+    sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, &ref_array[i], \
+                                               ref_stride); \
   } \
 }

 #define highbd_sadMxNx4D(m, n) \
-void vp9_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
-                                    const uint8_t *const refs[], \
-                                    int ref_stride, unsigned int *sads) { \
+void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                                    const uint8_t *const ref_array[], \
+                                    int ref_stride, uint32_t *sad_array) { \
   int i; \
   for (i = 0; i < 4; ++i) { \
-    sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, refs[i], \
-                                          ref_stride); \
-  } \
+    sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, ref_array[i], \
+                                               ref_stride); \
+  } \
 }

 // 64x64
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
new file mode 100644
index 000000000..606515d2c
--- /dev/null
+++ b/vpx_dsp/vpx_dsp.mk
@@ -0,0 +1,40 @@
+##
+##  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS. All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+DSP_SRCS-yes += vpx_dsp.mk
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += sad.c
+
+DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
+
+DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm
+DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_ENCODERS
+
+DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
+
+DSP_SRCS-yes += vpx_dsp_rtcd.c
+DSP_SRCS-yes += vpx_dsp_rtcd_defs.pl
+
+$(eval $(call rtcd_h_template,vpx_dsp_rtcd,vpx_dsp/vpx_dsp_rtcd_defs.pl))
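vpx_dsp_rtcd.c below is the run-time CPU detection hook for the new component: the generated vpx_dsp_rtcd.h declares one dispatch pointer per specializable function, and setup_rtcd_internal repoints each from the C fallback to the best kernel the host supports, executed once via vpx_once. The generated code behaves roughly like this sketch (x86_simd_caps() and HAS_SSE2 are real vpx_ports facilities; the rest is illustrative, not the actual generated source):

    #include "vpx/vpx_integer.h"
    #include "vpx_ports/x86.h"  /* x86_simd_caps(), HAS_SSE2, ... */

    unsigned int vpx_sad16x16_c(const uint8_t *, int, const uint8_t *, int);
    unsigned int vpx_sad16x16_sse2(const uint8_t *, int, const uint8_t *, int);

    /* Dispatch pointer, normally declared by the generated vpx_dsp_rtcd.h. */
    unsigned int (*vpx_sad16x16)(const uint8_t *, int, const uint8_t *, int);

    static void setup_rtcd_internal(void) {
      const int flags = x86_simd_caps();
      vpx_sad16x16 = vpx_sad16x16_c;  /* always-available C fallback */
      if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2;
    }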
diff --git a/vpx_dsp/vpx_dsp_rtcd.c b/vpx_dsp/vpx_dsp_rtcd.c
new file mode 100644
index 000000000..5fe27b614
--- /dev/null
+++ b/vpx_dsp/vpx_dsp_rtcd.c
@@ -0,0 +1,17 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+void vpx_dsp_rtcd() {
+    once(setup_rtcd_internal);
+}
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
new file mode 100644
index 000000000..ebec9ec06
--- /dev/null
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -0,0 +1,395 @@
+sub vpx_dsp_forward_decls() {
+print <<EOF
[... remainder of vpx_dsp_rtcd_defs.pl and the header of the sad4d_avx2.c diff lost in extraction ...]
 #include <immintrin.h>  // AVX2
 #include "vpx/vpx_integer.h"

-void vp9_sad32x32x4d_avx2(uint8_t *src,
+void vpx_sad32x32x4d_avx2(uint8_t *src,
                           int src_stride,
                           uint8_t *ref[4],
                           int ref_stride,
-                          unsigned int res[4]) {
+                          uint32_t res[4]) {
   __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
   __m256i sum_mlow, sum_mhigh;
@@ -80,11 +80,11 @@ void vp9_sad32x32x4d_avx2(uint8_t *src,
   }
 }

-void vp9_sad64x64x4d_avx2(uint8_t *src,
+void vpx_sad64x64x4d_avx2(uint8_t *src,
                           int src_stride,
                           uint8_t *ref[4],
                           int ref_stride,
-                          unsigned int res[4]) {
+                          uint32_t res[4]) {
   __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
   __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
   __m256i ref3_reg, ref3next_reg;
diff --git a/vp9/encoder/x86/vp9_sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm
similarity index 98%
rename from vp9/encoder/x86/vp9_sad4d_sse2.asm
rename to vpx_dsp/x86/sad4d_sse2.asm
index b4936281f..0f7fb93d4 100644
--- a/vp9/encoder/x86/vp9_sad4d_sse2.asm
+++ b/vpx_dsp/x86/sad4d_sse2.asm
@@ -8,6 +8,8 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;

+%define program_name vpx
+
 %include "third_party/x86inc/x86inc.asm"

 SECTION .text
@@ -167,9 +169,9 @@ SECTION .text
   PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
 %endmacro

-; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
-;                         unsigned int res[4]);
+;                         uint32_t res[4]);
 ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
 %macro SADNXN4D 2
 %if UNIX64
diff --git a/vp9/encoder/x86/vp9_sad_intrin_avx2.c b/vpx_dsp/x86/sad_avx2.c
similarity index 96%
rename from vp9/encoder/x86/vp9_sad_intrin_avx2.c
rename to vpx_dsp/x86/sad_avx2.c
index 113193070..78536a472 100644
--- a/vp9/encoder/x86/vp9_sad_intrin_avx2.c
+++ b/vpx_dsp/x86/sad_avx2.c
@@ -11,7 +11,7 @@
 #include "vpx_ports/mem.h"

 #define FSAD64_H(h) \
-unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \
                                   int src_stride, \
                                   const uint8_t *ref_ptr, \
                                   int ref_stride) { \
@@ -40,7 +40,7 @@ unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
 }

 #define FSAD32_H(h) \
-unsigned int vp9_sad32x##h##_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \
                                   int src_stride, \
                                   const uint8_t *ref_ptr, \
                                   int ref_stride) { \
@@ -89,7 +89,7 @@ FSAD32;
 #undef FSAD32_H

 #define FSADAVG64_H(h) \
-unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
                                       int src_stride, \
                                       const uint8_t *ref_ptr, \
                                       int ref_stride, \
@@ -124,7 +124,7 @@ unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
 }

 #define FSADAVG32_H(h) \
-unsigned int vp9_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
                                       int src_stride, \
                                       const uint8_t *ref_ptr, \
                                       int ref_stride, \
diff --git a/vp8/common/x86/sad_mmx.asm b/vpx_dsp/x86/sad_mmx.asm
similarity index 95%
rename from vp8/common/x86/sad_mmx.asm
rename to vpx_dsp/x86/sad_mmx.asm
index 592112fa9..9968992bd 100644
---
a/vp8/common/x86/sad_mmx.asm +++ b/vpx_dsp/x86/sad_mmx.asm @@ -11,18 +11,18 @@ %include "vpx_ports/x86_abi_support.asm" -global sym(vp8_sad16x16_mmx) PRIVATE -global sym(vp8_sad8x16_mmx) PRIVATE -global sym(vp8_sad8x8_mmx) PRIVATE -global sym(vp8_sad4x4_mmx) PRIVATE -global sym(vp8_sad16x8_mmx) PRIVATE +global sym(vpx_sad16x16_mmx) PRIVATE +global sym(vpx_sad8x16_mmx) PRIVATE +global sym(vpx_sad8x8_mmx) PRIVATE +global sym(vpx_sad4x4_mmx) PRIVATE +global sym(vpx_sad16x8_mmx) PRIVATE -;unsigned int vp8_sad16x16_mmx( +;unsigned int vpx_sad16x16_mmx( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride) -sym(vp8_sad16x16_mmx): +sym(vpx_sad16x16_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 @@ -109,12 +109,12 @@ sym(vp8_sad16x16_mmx): ret -;unsigned int vp8_sad8x16_mmx( +;unsigned int vpx_sad8x16_mmx( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride) -sym(vp8_sad8x16_mmx): +sym(vpx_sad8x16_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 @@ -181,12 +181,12 @@ sym(vp8_sad8x16_mmx): ret -;unsigned int vp8_sad8x8_mmx( +;unsigned int vpx_sad8x8_mmx( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride) -sym(vp8_sad8x8_mmx): +sym(vpx_sad8x8_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 @@ -251,12 +251,12 @@ sym(vp8_sad8x8_mmx): ret -;unsigned int vp8_sad4x4_mmx( +;unsigned int vpx_sad4x4_mmx( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride) -sym(vp8_sad4x4_mmx): +sym(vpx_sad4x4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 @@ -340,12 +340,12 @@ sym(vp8_sad4x4_mmx): ret -;unsigned int vp8_sad16x8_mmx( +;unsigned int vpx_sad16x8_mmx( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride) -sym(vp8_sad16x8_mmx): +sym(vpx_sad16x8_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm similarity index 95% rename from vp9/encoder/x86/vp9_sad_sse2.asm rename to vpx_dsp/x86/sad_sse2.asm index c4c5c54f0..c6a829dc2 100644 --- a/vp9/encoder/x86/vp9_sad_sse2.asm +++ b/vpx_dsp/x86/sad_sse2.asm @@ -8,6 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. 
; +%define program_name vpx + %include "third_party/x86inc/x86inc.asm" SECTION .text @@ -44,7 +46,7 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \ %endif ; %3 == 7 %endmacro -; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride, +; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 @@ -87,7 +89,7 @@ SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 -; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride, +; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 @@ -132,7 +134,7 @@ SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 -; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 @@ -178,7 +180,7 @@ SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 -; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 @@ -222,7 +224,7 @@ SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 -; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride, +; unsigned int vpx_sad4x{4, 8}_sse(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vpx_dsp/x86/sad_sse3.asm similarity index 94% rename from vp9/encoder/x86/vp9_sad_sse3.asm rename to vpx_dsp/x86/sad_sse3.asm index 2b90a5d54..18279bdb9 100644 --- a/vp9/encoder/x86/vp9_sad_sse3.asm +++ b/vpx_dsp/x86/sad_sse3.asm @@ -19,7 +19,6 @@ %define end_ptr rcx %define ret_var rbx %define result_ptr arg(4) - %define max_err arg(4) %define height dword ptr arg(4) push rbp mov rbp, rsp @@ -42,7 +41,6 @@ %define end_ptr r10 %define ret_var r11 %define result_ptr [rsp+xmm_stack_space+8+4*8] - %define max_err [rsp+xmm_stack_space+8+4*8] %define height dword ptr [rsp+xmm_stack_space+8+4*8] %else %define src_ptr rdi @@ -52,7 +50,6 @@ %define end_ptr r9 %define ret_var r10 %define result_ptr r8 - %define max_err r8 %define height r8 %endif %endif @@ -67,7 +64,6 @@ %define end_ptr %define ret_var %define result_ptr - %define max_err %define height %if ABI_IS_32BIT @@ -169,14 +165,14 @@ paddw mm7, mm3 %endmacro -;void int vp9_sad16x16x3_sse3( +;void int vpx_sad16x16x3_sse3( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride, ; int *results) -global sym(vp9_sad16x16x3_sse3) PRIVATE -sym(vp9_sad16x16x3_sse3): +global sym(vpx_sad16x16x3_sse3) PRIVATE +sym(vpx_sad16x16x3_sse3): STACK_FRAME_CREATE_X3 @@ -211,14 +207,14 @@ sym(vp9_sad16x16x3_sse3): STACK_FRAME_DESTROY_X3 -;void int vp9_sad16x8x3_sse3( +;void int vpx_sad16x8x3_sse3( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride, ; int *results) -global sym(vp9_sad16x8x3_sse3) PRIVATE -sym(vp9_sad16x8x3_sse3): +global sym(vpx_sad16x8x3_sse3) PRIVATE +sym(vpx_sad16x8x3_sse3): STACK_FRAME_CREATE_X3 @@ -249,14 +245,14 @@ sym(vp9_sad16x8x3_sse3): STACK_FRAME_DESTROY_X3 -;void int vp9_sad8x16x3_sse3( +;void int vpx_sad8x16x3_sse3( ; unsigned char *src_ptr, ; int src_stride, ; 
unsigned char *ref_ptr, ; int ref_stride, ; int *results) -global sym(vp9_sad8x16x3_sse3) PRIVATE -sym(vp9_sad8x16x3_sse3): +global sym(vpx_sad8x16x3_sse3) PRIVATE +sym(vpx_sad8x16x3_sse3): STACK_FRAME_CREATE_X3 @@ -278,14 +274,14 @@ sym(vp9_sad8x16x3_sse3): STACK_FRAME_DESTROY_X3 -;void int vp9_sad8x8x3_sse3( +;void int vpx_sad8x8x3_sse3( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride, ; int *results) -global sym(vp9_sad8x8x3_sse3) PRIVATE -sym(vp9_sad8x8x3_sse3): +global sym(vpx_sad8x8x3_sse3) PRIVATE +sym(vpx_sad8x8x3_sse3): STACK_FRAME_CREATE_X3 @@ -303,14 +299,14 @@ sym(vp9_sad8x8x3_sse3): STACK_FRAME_DESTROY_X3 -;void int vp9_sad4x4x3_sse3( +;void int vpx_sad4x4x3_sse3( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride, ; int *results) -global sym(vp9_sad4x4x3_sse3) PRIVATE -sym(vp9_sad4x4x3_sse3): +global sym(vpx_sad4x4x3_sse3) PRIVATE +sym(vpx_sad4x4x3_sse3): STACK_FRAME_CREATE_X3 diff --git a/vp9/encoder/x86/vp9_sad_sse4.asm b/vpx_dsp/x86/sad_sse4.asm similarity index 95% rename from vp9/encoder/x86/vp9_sad_sse4.asm rename to vpx_dsp/x86/sad_sse4.asm index faf1768a9..bc6744797 100644 --- a/vp9/encoder/x86/vp9_sad_sse4.asm +++ b/vpx_dsp/x86/sad_sse4.asm @@ -165,14 +165,14 @@ movdqa [rdi + 16], xmm2 %endmacro -;void vp9_sad16x16x8_sse4( +;void vpx_sad16x16x8_sse4_1( ; const unsigned char *src_ptr, ; int src_stride, ; const unsigned char *ref_ptr, ; int ref_stride, ; unsigned short *sad_array); -global sym(vp9_sad16x16x8_sse4) PRIVATE -sym(vp9_sad16x16x8_sse4): +global sym(vpx_sad16x16x8_sse4_1) PRIVATE +sym(vpx_sad16x16x8_sse4_1): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -205,15 +205,15 @@ sym(vp9_sad16x16x8_sse4): ret -;void vp9_sad16x8x8_sse4( +;void vpx_sad16x8x8_sse4_1( ; const unsigned char *src_ptr, ; int src_stride, ; const unsigned char *ref_ptr, ; int ref_stride, ; unsigned short *sad_array ;); -global sym(vp9_sad16x8x8_sse4) PRIVATE -sym(vp9_sad16x8x8_sse4): +global sym(vpx_sad16x8x8_sse4_1) PRIVATE +sym(vpx_sad16x8x8_sse4_1): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -242,15 +242,15 @@ sym(vp9_sad16x8x8_sse4): ret -;void vp9_sad8x8x8_sse4( +;void vpx_sad8x8x8_sse4_1( ; const unsigned char *src_ptr, ; int src_stride, ; const unsigned char *ref_ptr, ; int ref_stride, ; unsigned short *sad_array ;); -global sym(vp9_sad8x8x8_sse4) PRIVATE -sym(vp9_sad8x8x8_sse4): +global sym(vpx_sad8x8x8_sse4_1) PRIVATE +sym(vpx_sad8x8x8_sse4_1): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -279,15 +279,15 @@ sym(vp9_sad8x8x8_sse4): ret -;void vp9_sad8x16x8_sse4( +;void vpx_sad8x16x8_sse4_1( ; const unsigned char *src_ptr, ; int src_stride, ; const unsigned char *ref_ptr, ; int ref_stride, ; unsigned short *sad_array ;); -global sym(vp9_sad8x16x8_sse4) PRIVATE -sym(vp9_sad8x16x8_sse4): +global sym(vpx_sad8x16x8_sse4_1) PRIVATE +sym(vpx_sad8x16x8_sse4_1): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -320,15 +320,15 @@ sym(vp9_sad8x16x8_sse4): ret -;void vp9_sad4x4x8_c( +;void vpx_sad4x4x8_sse4_1( ; const unsigned char *src_ptr, ; int src_stride, ; const unsigned char *ref_ptr, ; int ref_stride, ; unsigned short *sad_array ;); -global sym(vp9_sad4x4x8_sse4) PRIVATE -sym(vp9_sad4x4x8_sse4): +global sym(vpx_sad4x4x8_sse4_1) PRIVATE +sym(vpx_sad4x4x8_sse4_1): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 diff --git a/vp8/common/x86/sad_ssse3.asm b/vpx_dsp/x86/sad_ssse3.asm similarity index 64% rename from vp8/common/x86/sad_ssse3.asm rename to vpx_dsp/x86/sad_ssse3.asm index 278fc0640..49f204fa0 100644 
--- a/vp8/common/x86/sad_ssse3.asm +++ b/vpx_dsp/x86/sad_ssse3.asm @@ -146,14 +146,14 @@ %endmacro -;void int vp8_sad16x16x3_ssse3( +;void int vpx_sad16x16x3_ssse3( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride, ; int *results) -global sym(vp8_sad16x16x3_ssse3) PRIVATE -sym(vp8_sad16x16x3_ssse3): +global sym(vpx_sad16x16x3_ssse3) PRIVATE +sym(vpx_sad16x16x3_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -169,31 +169,31 @@ sym(vp8_sad16x16x3_ssse3): mov rdx, 0xf and rdx, rdi - jmp .vp8_sad16x16x3_ssse3_skiptable -.vp8_sad16x16x3_ssse3_jumptable: - dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump - dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump -.vp8_sad16x16x3_ssse3_skiptable: - - call .vp8_sad16x16x3_ssse3_do_jump -.vp8_sad16x16x3_ssse3_do_jump: + jmp .vpx_sad16x16x3_ssse3_skiptable +.vpx_sad16x16x3_ssse3_jumptable: + dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump + dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump +.vpx_sad16x16x3_ssse3_skiptable: + + call .vpx_sad16x16x3_ssse3_do_jump +.vpx_sad16x16x3_ssse3_do_jump: pop rcx ; get the address of do_jump - mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable + mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump + add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable movsxd rax, 
dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable add rcx, rax @@ -203,23 +203,23 @@ sym(vp8_sad16x16x3_ssse3): jmp rcx - PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3 - -.vp8_sad16x16x3_ssse3_aligned_by_15: + PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3 + +.vpx_sad16x16x3_ssse3_aligned_by_15: PROCESS_16X2X3 1 PROCESS_16X2X3 0 PROCESS_16X2X3 0 @@ -229,7 +229,7 @@ sym(vp8_sad16x16x3_ssse3): PROCESS_16X2X3 0 PROCESS_16X2X3 0 -.vp8_sad16x16x3_ssse3_store_off: +.vpx_sad16x16x3_ssse3_store_off: mov rdi, arg(4) ;Results movq xmm0, xmm5 @@ -259,14 +259,14 @@ sym(vp8_sad16x16x3_ssse3): pop rbp ret -;void int vp8_sad16x8x3_ssse3( +;void int vpx_sad16x8x3_ssse3( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride, ; int *results) -global sym(vp8_sad16x8x3_ssse3) PRIVATE -sym(vp8_sad16x8x3_ssse3): +global sym(vpx_sad16x8x3_ssse3) PRIVATE +sym(vpx_sad16x8x3_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -282,31 +282,31 @@ sym(vp8_sad16x8x3_ssse3): mov rdx, 0xf and rdx, rdi - jmp .vp8_sad16x8x3_ssse3_skiptable -.vp8_sad16x8x3_ssse3_jumptable: - dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_14 - 
.vp8_sad16x8x3_ssse3_do_jump - dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump -.vp8_sad16x8x3_ssse3_skiptable: - - call .vp8_sad16x8x3_ssse3_do_jump -.vp8_sad16x8x3_ssse3_do_jump: + jmp .vpx_sad16x8x3_ssse3_skiptable +.vpx_sad16x8x3_ssse3_jumptable: + dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump + dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump +.vpx_sad16x8x3_ssse3_skiptable: + + call .vpx_sad16x8x3_ssse3_do_jump +.vpx_sad16x8x3_ssse3_do_jump: pop rcx ; get the address of do_jump - mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable + mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump + add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable add rcx, rax @@ -316,30 +316,30 @@ sym(vp8_sad16x8x3_ssse3): jmp rcx - PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3 - -.vp8_sad16x8x3_ssse3_aligned_by_15: + PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3 + +.vpx_sad16x8x3_ssse3_aligned_by_15: PROCESS_16X2X3 1 PROCESS_16X2X3 0 PROCESS_16X2X3 
0 PROCESS_16X2X3 0 -.vp8_sad16x8x3_ssse3_store_off: +.vpx_sad16x8x3_ssse3_store_off: mov rdi, arg(4) ;Results movq xmm0, xmm5
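For reference, the jump tables being renamed in sad_ssse3.asm implement alignment dispatch: the prologue computes ref_ptr & 0xf, indexes a table of position-independent 32-bit offsets, and jumps to a copy of the SAD loop specialized for that misalignment (the aligned_by_15 case falls back to plain unaligned loads). The control flow corresponds to this C sketch (the process_aligned_by table stands in for the PROCESS_16X*X3 macro expansions and is illustrative):

    #include <stdint.h>

    typedef void (*sad_body_fn)(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                int *results);

    /* One loop body per possible 16-byte misalignment of ref. */
    extern const sad_body_fn process_aligned_by[16];

    static void sad16x16x3_dispatch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int *results) {
      const unsigned align = (unsigned)((uintptr_t)ref & 0xf);  /* and rdx, rdi */
      process_aligned_by[align](src, src_stride, ref, ref_stride, results);
    }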