From ea167a58550e14f3b35dde46a405790a929703df Mon Sep 17 00:00:00 2001
From: Julia Robson
Date: Mon, 23 Nov 2015 12:50:32 +0000
Subject: [PATCH] Adding SSSE3 accelerations of masked SAD functions

Includes tests comparing the output of the optimised masked SAD
functions against the C reference versions.

Change-Id: I42f198767a113b58ae9456841f4ec71075591720
---
 test/masked_sad_test.cc                | 209 ++++++++++++++
 test/test.mk                           |   4 +
 vp9/common/vp9_rtcd_defs.pl            |  52 ++--
 vp9/encoder/x86/vp9_sad_intrin_ssse3.c | 365 +++++++++++++++++++++++++
 vp9/vp9cx.mk                           |   1 +
 5 files changed, 605 insertions(+), 26 deletions(-)
 create mode 100644 test/masked_sad_test.cc
 create mode 100644 vp9/encoder/x86/vp9_sad_intrin_ssse3.c
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
new file mode 100644
index 000000000..2c9d3428c
--- /dev/null
+++ b/test/masked_sad_test.cc
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 500;
+
+typedef unsigned int (*MaskedSADFunc)(const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride,
+                                      const uint8_t *m, int m_stride);
+typedef std::tr1::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
+
+class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
+ public:
+  virtual ~MaskedSADTest() {}
+  virtual void SetUp() {
+    maskedSAD_op_ = GET_PARAM(0);
+    ref_maskedSAD_op_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  MaskedSADFunc maskedSAD_op_;
+  MaskedSADFunc ref_maskedSAD_op_;
+};
+
+TEST_P(MaskedSADTest, OperationCheck) {
+  unsigned int ref_ret, ret;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, src_ptr, 4096);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, ref_ptr, 4096);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, 4096);
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = 64;
+  int ref_stride = 64;
+  int msk_stride = 64;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    for (int j = 0; j < 4096; j++) {
+      src_ptr[j] = rnd.Rand8();
+      ref_ptr[j] = rnd.Rand8();
+      msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
+    }
+
+    ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
+                                msk_ptr, msk_stride);
+    ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride,
+                                                 ref_ptr, ref_stride,
+                                                 msk_ptr, msk_stride));
+    if (ret != ref_ret) {
+      err_count++;
+      if (first_failure == -1)
+        first_failure = i;
+    }
+  }
+  EXPECT_EQ(0, err_count)
+      << "Error: Masked SAD Test, C output doesn't match SSSE3 output. "
+      << "First failed at test case " << first_failure;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *a, int a_stride,
+                                            const uint8_t *b, int b_stride,
+                                            const uint8_t *m, int m_stride);
+typedef std::tr1::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
+    HighbdMaskedSADParam;
+
+class HighbdMaskedSADTest : public ::testing::
+                            TestWithParam<HighbdMaskedSADParam> {
+ public:
+  virtual ~HighbdMaskedSADTest() {}
+  virtual void SetUp() {
+    maskedSAD_op_ = GET_PARAM(0);
+    ref_maskedSAD_op_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  HighbdMaskedSADFunc maskedSAD_op_;
+  HighbdMaskedSADFunc ref_maskedSAD_op_;
+};
+
+TEST_P(HighbdMaskedSADTest, OperationCheck) {
+  unsigned int ref_ret, ret;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, src_ptr, 4096);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_ptr, 4096);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, msk_ptr, 4096);
+  uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+  uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = 64;
+  int ref_stride = 64;
+  int msk_stride = 64;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    for (int j = 0; j < 4096; j++) {
+      src_ptr[j] = rnd.Rand16()&0xfff;
+      ref_ptr[j] = rnd.Rand16()&0xfff;
+      msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
+    }
+
+    ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+                                msk_ptr, msk_stride);
+    ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+                                                 ref8_ptr, ref_stride,
+                                                 msk_ptr, msk_stride));
+    if (ret != ref_ret) {
+      err_count++;
+      if (first_failure == -1)
+        first_failure = i;
+    }
+  }
+  EXPECT_EQ(0, err_count)
+      << "Error: High BD Masked SAD Test, C output doesn't match SSSE3 output. "
+      << "First failed at test case " << first_failure;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_C_COMPARE, MaskedSADTest,
+    ::testing::Values(
+        make_tuple(&vp9_masked_sad64x64_ssse3,
+                   &vp9_masked_sad64x64_c),
+        make_tuple(&vp9_masked_sad64x32_ssse3,
+                   &vp9_masked_sad64x32_c),
+        make_tuple(&vp9_masked_sad32x64_ssse3,
+                   &vp9_masked_sad32x64_c),
+        make_tuple(&vp9_masked_sad32x32_ssse3,
+                   &vp9_masked_sad32x32_c),
+        make_tuple(&vp9_masked_sad32x16_ssse3,
+                   &vp9_masked_sad32x16_c),
+        make_tuple(&vp9_masked_sad16x32_ssse3,
+                   &vp9_masked_sad16x32_c),
+        make_tuple(&vp9_masked_sad16x16_ssse3,
+                   &vp9_masked_sad16x16_c),
+        make_tuple(&vp9_masked_sad16x8_ssse3,
+                   &vp9_masked_sad16x8_c),
+        make_tuple(&vp9_masked_sad8x16_ssse3,
+                   &vp9_masked_sad8x16_c),
+        make_tuple(&vp9_masked_sad8x8_ssse3,
+                   &vp9_masked_sad8x8_c),
+        make_tuple(&vp9_masked_sad8x4_ssse3,
+                   &vp9_masked_sad8x4_c),
+        make_tuple(&vp9_masked_sad4x8_ssse3,
+                   &vp9_masked_sad4x8_c),
+        make_tuple(&vp9_masked_sad4x4_ssse3,
+                   &vp9_masked_sad4x4_c)));
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_C_COMPARE, HighbdMaskedSADTest,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_masked_sad64x64_ssse3,
+                   &vp9_highbd_masked_sad64x64_c),
+        make_tuple(&vp9_highbd_masked_sad64x32_ssse3,
+                   &vp9_highbd_masked_sad64x32_c),
+        make_tuple(&vp9_highbd_masked_sad32x64_ssse3,
+                   &vp9_highbd_masked_sad32x64_c),
+        make_tuple(&vp9_highbd_masked_sad32x32_ssse3,
+                   &vp9_highbd_masked_sad32x32_c),
+        make_tuple(&vp9_highbd_masked_sad32x16_ssse3,
+                   &vp9_highbd_masked_sad32x16_c),
+        make_tuple(&vp9_highbd_masked_sad16x32_ssse3,
+                   &vp9_highbd_masked_sad16x32_c),
+        make_tuple(&vp9_highbd_masked_sad16x16_ssse3,
+                   &vp9_highbd_masked_sad16x16_c),
+        make_tuple(&vp9_highbd_masked_sad16x8_ssse3,
+                   &vp9_highbd_masked_sad16x8_c),
+        make_tuple(&vp9_highbd_masked_sad8x16_ssse3,
+                   &vp9_highbd_masked_sad8x16_c),
+        make_tuple(&vp9_highbd_masked_sad8x8_ssse3,
+                   &vp9_highbd_masked_sad8x8_c),
+        make_tuple(&vp9_highbd_masked_sad8x4_ssse3,
+                   &vp9_highbd_masked_sad8x4_c),
+        make_tuple(&vp9_highbd_masked_sad4x8_ssse3,
+                   &vp9_highbd_masked_sad4x8_c),
+        make_tuple(&vp9_highbd_masked_sad4x4_ssse3,
+                   &vp9_highbd_masked_sad4x4_c)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSSE3
+}  // namespace
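Note on the test inputs: the mask generator above can only produce
values in [0, 64], matching the kernels' assumption that mask values
are at most 64. A standalone check (not part of the patch) that makes
the invariant explicit:

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    int v, w;
    for (v = 0; v < 256; ++v) {    /* all possible Rand8() draws */
      for (w = 0; w < 256; ++w) {
        uint8_t m = ((v & 0x7f) > 64) ? (uint8_t)(w & 0x3f) : 64;
        assert(m <= 64);           /* either 0..63, or exactly 64 */
      }
    }
    return 0;
  }

The new tests can be run in isolation with, for example,
./test_libvpx --gtest_filter='SSSE3_C_COMPARE/MaskedSADTest.*'
(assuming the usual libvpx test binary name).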
diff --git a/test/test.mk b/test/test.mk
index 625a97782..15ed6a940 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -148,6 +148,10 @@ ifeq ($(CONFIG_VP9)$(CONFIG_WEDGE_PARTITION),yesyes)
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
 endif
 
+ifeq ($(CONFIG_VP9)$(CONFIG_WEDGE_PARTITION),yesyes)
+LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
+endif
+
 endif # VP9
 
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index b7789e573..c24dff945 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1526,43 +1526,43 @@ if (vpx_config("CONFIG_WEDGE_PARTITION") eq "yes") {
   specialize qw/vp9_masked_sub_pixel_variance4x4 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad64x64/;
+  specialize qw/vp9_masked_sad64x64 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad32x64/;
+  specialize qw/vp9_masked_sad32x64 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad64x32/;
+  specialize qw/vp9_masked_sad64x32 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad32x16/;
+  specialize qw/vp9_masked_sad32x16 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad16x32/;
+  specialize qw/vp9_masked_sad16x32 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad32x32/;
+  specialize qw/vp9_masked_sad32x32 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad16x16/;
+  specialize qw/vp9_masked_sad16x16 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad16x8/;
+  specialize qw/vp9_masked_sad16x8 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad8x16/;
+  specialize qw/vp9_masked_sad8x16 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad8x8/;
+  specialize qw/vp9_masked_sad8x8 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad8x4/;
+  specialize qw/vp9_masked_sad8x4 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad4x8/;
+  specialize qw/vp9_masked_sad4x8 ssse3/;
 
   add_proto qw/unsigned int vp9_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_masked_sad4x4/;
+  specialize qw/vp9_masked_sad4x4 ssse3/;
 
   if (vpx_config("CONFIG_EXT_CODING_UNIT_SIZE") eq "yes") {
     add_proto qw/unsigned int vp9_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
@@ -2789,43 +2789,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_highbd_12_masked_sub_pixel_variance4x4/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad64x64/;
+  specialize qw/vp9_highbd_masked_sad64x64 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad32x64/;
+  specialize qw/vp9_highbd_masked_sad32x64 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad64x32/;
+  specialize qw/vp9_highbd_masked_sad64x32 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad32x16/;
+  specialize qw/vp9_highbd_masked_sad32x16 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad16x32/;
+  specialize qw/vp9_highbd_masked_sad16x32 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad32x32/;
+  specialize qw/vp9_highbd_masked_sad32x32 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad16x16/;
+  specialize qw/vp9_highbd_masked_sad16x16 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad16x8/;
+  specialize qw/vp9_highbd_masked_sad16x8 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad8x16/;
+  specialize qw/vp9_highbd_masked_sad8x16 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad8x8/;
+  specialize qw/vp9_highbd_masked_sad8x8 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad8x4/;
+  specialize qw/vp9_highbd_masked_sad8x4 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad4x8/;
+  specialize qw/vp9_highbd_masked_sad4x8 ssse3/;
 
   add_proto qw/unsigned int vp9_highbd_masked_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
-  specialize qw/vp9_highbd_masked_sad4x4/;
+  specialize qw/vp9_highbd_masked_sad4x4 ssse3/;
 
   if (vpx_config("CONFIG_EXT_CODING_UNIT_SIZE") eq "yes") {
     add_proto qw/unsigned int vp9_highbd_masked_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
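For context, adding "ssse3" to a specialize line causes the RTCD
generator to declare an SSSE3 entry point for that prototype and route
callers to it. With CONFIG_RUNTIME_CPU_DETECT enabled, the generated
vp9_rtcd.h behaves roughly like the simplified sketch below (not the
literal generated code; without runtime detection the indirection
collapses to a #define). HAS_SSSE3 is the CPU-capability flag from
vpx_ports/x86.h.

  /* Simplified sketch of RTCD dispatch for one of the new prototypes. */
  unsigned int vp9_masked_sad16x16_c(const uint8_t *src_ptr,
                                     int source_stride,
                                     const uint8_t *ref_ptr, int ref_stride,
                                     const uint8_t *mask, int mask_stride);
  unsigned int vp9_masked_sad16x16_ssse3(const uint8_t *src_ptr,
                                         int source_stride,
                                         const uint8_t *ref_ptr,
                                         int ref_stride,
                                         const uint8_t *mask, int mask_stride);
  unsigned int (*vp9_masked_sad16x16)(const uint8_t *src_ptr,
                                      int source_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      const uint8_t *mask, int mask_stride);

  static void setup_rtcd_internal(int simd_caps) {
    vp9_masked_sad16x16 = vp9_masked_sad16x16_c;      /* safe default */
    if (simd_caps & HAS_SSSE3)                        /* CPU supports SSSE3 */
      vp9_masked_sad16x16 = vp9_masked_sad16x16_ssse3;
  }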
diff --git a/vp9/encoder/x86/vp9_sad_intrin_ssse3.c b/vp9/encoder/x86/vp9_sad_intrin_ssse3.c
new file mode 100644
index 000000000..de336eb6c
--- /dev/null
+++ b/vp9/encoder/x86/vp9_sad_intrin_ssse3.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "vpx_ports/mem.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
+  __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
+  __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
+  return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
+  __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t*)ptr);
+  __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride));
+  __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
+  temp1 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 2));
+  temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 3));
+  temp1 = _mm_unpacklo_epi32(temp1, temp2);
+  return _mm_unpacklo_epi64(temp3, temp1);
+}
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height);
+
+static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
+                                               int a_stride,
+                                               const uint8_t *b_ptr,
+                                               int b_stride,
+                                               const uint8_t *m_ptr,
+                                               int m_stride,
+                                               int height);
+
+static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
+                                               int a_stride,
+                                               const uint8_t *b_ptr,
+                                               int b_stride,
+                                               const uint8_t *m_ptr,
+                                               int m_stride,
+                                               int height);
+
+#define MASKSADMXN_SSSE3(m, n) \
+unsigned int vp9_masked_sad##m##x##n##_ssse3(const uint8_t *src, \
+                                             int src_stride, \
+                                             const uint8_t *ref, \
+                                             int ref_stride, \
+                                             const uint8_t *msk, \
+                                             int msk_stride) { \
+  return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
+                          m, n); \
+}
+
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+
+#define MASKSAD8XN_SSSE3(n) \
+unsigned int vp9_masked_sad8x##n##_ssse3(const uint8_t *src, \
+                                         int src_stride, \
+                                         const uint8_t *ref, \
+                                         int ref_stride, \
+                                         const uint8_t *msk, \
+                                         int msk_stride) { \
+  return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+                             msk_stride, n); \
+}
+
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+
+#define MASKSAD4XN_SSSE3(n) \
+unsigned int vp9_masked_sad4x##n##_ssse3(const uint8_t *src, int src_stride, \
+                                         const uint8_t *ref, int ref_stride, \
+                                         const uint8_t *msk, int msk_stride) { \
+  return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+                             msk_stride, n); \
+}
+
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+
+// For width a multiple of 16
+// Assumes values in m are <=64 and w = 16, 32, or 64
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height) {
+  int y, x;
+  __m128i a, b, m, temp1, temp2;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // For each row
+  for (y = 0; y < height; y++) {
+    // Covering the full width
+    for (x = 0; x < width; x += 16) {
+      // Load a, b, m in xmm registers
+      a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
+      b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
+      m = _mm_loadu_si128((const __m128i*)(m_ptr + x));
+
+      // Calculate the difference between a & b
+      temp1 = _mm_subs_epu8(a, b);
+      temp2 = _mm_subs_epu8(b, a);
+      temp1 = _mm_or_si128(temp1, temp2);
+
+      // Multiply by m and add together
+      temp2 = _mm_maddubs_epi16(temp1, m);
+      // Pad out row result to 32 bit integers & add to running total
+      res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
+    }
+    // Move onto the next row
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
+                                               int a_stride,
+                                               const uint8_t *b_ptr,
+                                               int b_stride,
+                                               const uint8_t *m_ptr,
+                                               int m_stride,
+                                               int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2, row_res;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // Add the masked SAD for 2 rows at a time
+  for (y = 0; y < height; y += 2) {
+    // Load a, b, m in xmm registers
+    a = width8_load_2rows(a_ptr, a_stride);
+    b = width8_load_2rows(b_ptr, b_stride);
+    m = width8_load_2rows(m_ptr, m_stride);
+
+    // Calculate the difference between a & b
+    temp1 = _mm_subs_epu8(a, b);
+    temp2 = _mm_subs_epu8(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m and add together
+    row_res = _mm_maddubs_epi16(temp1, m);
+
+    // Pad out row result to 32 bit integers & add to running total
+    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 2;
+    b_ptr += b_stride * 2;
+    m_ptr += m_stride * 2;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
+                                               int a_stride,
+                                               const uint8_t *b_ptr,
+                                               int b_stride,
+                                               const uint8_t *m_ptr,
+                                               int m_stride,
+                                               int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // Add the masked SAD for 4 rows at a time
+  for (y = 0; y < height; y += 4) {
+    // Load a, b, m in xmm registers
+    a = width4_load_4rows(a_ptr, a_stride);
+    b = width4_load_4rows(b_ptr, b_stride);
+    m = width4_load_4rows(m_ptr, m_stride);
+
+    // Calculate the difference between a & b
+    temp1 = _mm_subs_epu8(a, b);
+    temp2 = _mm_subs_epu8(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m and add together
+    res = _mm_add_epi32(res, _mm_maddubs_epi16(temp1, m));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 4;
+    b_ptr += b_stride * 4;
+    m_ptr += m_stride * 4;
+  }
+  // Pad out row result to 32 bit integers & add to running total
+  res = _mm_madd_epi16(res, one);
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
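The workhorse of the kernels above is _mm_maddubs_epi16: it multiplies
unsigned bytes from its first operand by signed bytes from its second
and sums adjacent product pairs into 16-bit lanes with signed
saturation. Because mask values are at most 64 and each absolute
difference is at most 255, a lane never exceeds 2 * 255 * 64 = 32640,
which is below 32767, so saturation cannot fire. A scalar model of one
lane (illustrative only, not part of the patch):

  #include <stdint.h>

  /* Scalar model of one 16-bit lane of _mm_maddubs_epi16(diff, mask). */
  static int16_t maddubs_lane(uint8_t d0, int8_t m0, uint8_t d1, int8_t m1) {
    int32_t sum = (int32_t)d0 * m0 + (int32_t)d1 * m1;
    if (sum > INT16_MAX) sum = INT16_MAX;  /* unreachable for mask <= 64, */
    if (sum < INT16_MIN) sum = INT16_MIN;  /* since 2 * 255 * 64 < 32767  */
    return (int16_t)sum;
  }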
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
+                                               int stride) {
+  __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
+  __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
+  return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
+                                                   int a_stride,
+                                                   const uint8_t *b8_ptr,
+                                                   int b_stride,
+                                                   const uint8_t *m_ptr,
+                                                   int m_stride,
+                                                   int width, int height);
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
+                                                      int a_stride,
+                                                      const uint8_t *b8_ptr,
+                                                      int b_stride,
+                                                      const uint8_t *m_ptr,
+                                                      int m_stride,
+                                                      int height);
+
+#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
+unsigned int vp9_highbd_masked_sad##m##x##n##_ssse3(const uint8_t *src, \
+                                                    int src_stride, \
+                                                    const uint8_t *ref, \
+                                                    int ref_stride, \
+                                                    const uint8_t *msk, \
+                                                    int msk_stride) { \
+  return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \
+                                 msk_stride, m, n); \
+}
+
+HIGHBD_MASKSADMXN_SSSE3(64, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 64)
+HIGHBD_MASKSADMXN_SSSE3(32, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 16)
+HIGHBD_MASKSADMXN_SSSE3(8, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 4)
+
+#define HIGHBD_MASKSAD4XN_SSSE3(n) \
+unsigned int vp9_highbd_masked_sad4x##n##_ssse3(const uint8_t *src, \
+                                                int src_stride, \
+                                                const uint8_t *ref, \
+                                                int ref_stride, \
+                                                const uint8_t *msk, \
+                                                int msk_stride) { \
+  return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+                                    msk_stride, n); \
+}
+
+HIGHBD_MASKSAD4XN_SSSE3(8)
+HIGHBD_MASKSAD4XN_SSSE3(4)
+
+// For width a multiple of 8
+// Assumes values in m are <=64
+static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
+                                                   int a_stride,
+                                                   const uint8_t *b8_ptr,
+                                                   int b_stride,
+                                                   const uint8_t *m_ptr,
+                                                   int m_stride,
+                                                   int width, int height) {
+  int y, x;
+  __m128i a, b, m, temp1, temp2;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+  __m128i res = _mm_setzero_si128();
+  // For each row
+  for (y = 0; y < height; y++) {
+    // Covering the full width
+    for (x = 0; x < width; x += 8) {
+      // Load a, b, m in xmm registers
+      a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
+      b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
+      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(m_ptr + x)),
+                            _mm_setzero_si128());
+
+      // Calculate the difference between a & b
+      temp1 = _mm_subs_epu16(a, b);
+      temp2 = _mm_subs_epu16(b, a);
+      temp1 = _mm_or_si128(temp1, temp2);
+
+      // Add result of multiplying by m and add pairs together to running total
+      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+    }
+    // Move onto the next row
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
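In the high bit depth path the pixels are already 16 bits wide, so the
kernels multiply-accumulate with _mm_madd_epi16 instead: adjacent
signed 16-bit products are summed directly into 32-bit lanes, and no
saturation is needed since with 12-bit input the per-lane bound is
2 * 4095 * 64 = 524160, far below INT32_MAX. A scalar model of one
lane (illustrative only, not part of the patch):

  #include <stdint.h>

  /* Scalar model of one 32-bit lane of _mm_madd_epi16(diff, mask). */
  static int32_t madd_lane(int16_t d0, int16_t m0, int16_t d1, int16_t m1) {
    return (int32_t)d0 * m0 + (int32_t)d1 * m1;  /* cannot overflow here */
  }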
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
+                                                      int a_stride,
+                                                      const uint8_t *b8_ptr,
+                                                      int b_stride,
+                                                      const uint8_t *m_ptr,
+                                                      int m_stride,
+                                                      int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+  __m128i res = _mm_setzero_si128();
+  // Add the masked SAD for 2 rows at a time
+  for (y = 0; y < height; y += 2) {
+    // Load a, b, m in xmm registers
+    a = highbd_width4_load_2rows(a_ptr, a_stride);
+    b = highbd_width4_load_2rows(b_ptr, b_stride);
+    temp1 = _mm_loadl_epi64((const __m128i*)m_ptr);
+    temp2 = _mm_loadl_epi64((const __m128i*)(m_ptr + m_stride));
+    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
+                          _mm_setzero_si128());
+
+    // Calculate the difference between a & b
+    temp1 = _mm_subs_epu16(a, b);
+    temp2 = _mm_subs_epu16(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m and add together
+    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 2;
+    b_ptr += b_stride * 2;
+    m_ptr += m_stride * 2;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 3de596c28..634fa1bfe 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -137,6 +137,7 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
 endif
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_intrin_ssse3.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_masked_variance_intrin_ssse3.c
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad_intrin_avx2.c
-- 
2.40.0