From bfa59b4a5f3f96526b93e758d753ee9b8e389c22 Mon Sep 17 00:00:00 2001
From: Geza Lore
Date: Mon, 11 Jul 2016 12:43:47 +0100
Subject: [PATCH] Improve vpx_blend_* functions.

- Made source buffers pointers to const.

- Renamed vpx_blend_mask6b to vpx_blend_a64_mask. The new name better
  indicates that the function does alpha blending. The 6, or 6b,
  suffix was misleading, as the max mask value (64) does not fit into
  6 bits.

- Added VPX_BLEND_* macros for use when blending scalar values.

- Used VPX_BLEND_A256 in combine_interintra to be more explicit about
  the operation being done.

- Added versions of vpx_blend_a64_* which take 1D horizontal/vertical
  masks directly and apply them to all rows/columns
  (vpx_blend_a64_hmask and vpx_blend_a64_vmask). The SSE4.1 optimized
  horizontal version now falls back on the 2D version. This can be
  improved upon if it shows up high enough in a profile.

- All vpx_blend_a64_* functions now support block sizes down to 1x1
  (i.e. a single pixel). This is for usage convenience. The SSE4.1
  optimized versions fall back on the C implementation if w <= 2 or
  h <= 2. This can again be improved if it becomes hot code.

Change-Id: I13ab3835146ffafe3e1d74d8e9cf64a5abe4144d
---
 test/blend_a64_mask_1d_test.cc                | 374 +++++++++++
 ..._mask6b_test.cc => blend_a64_mask_test.cc} |  65 +-
 test/test.mk                                  |   3 +-
 test/vp10_wedge_utils_test.cc                 |   2 +-
 vp10/common/reconinter.c                      | 211 +++---
 vp10/common/reconinter.h                      |   4 +-
 vpx_dsp/blend.h                               |  40 ++
 vpx_dsp/blend_a64_hmask.c                     |  73 +++
 vpx_dsp/blend_a64_mask.c                      | 151 +++++
 vpx_dsp/blend_a64_vmask.c                     |  75 +++
 vpx_dsp/blend_mask.h                          |  17 -
 vpx_dsp/blend_mask6b.c                        | 151 -----
 vpx_dsp/vpx_dsp.mk                            |  13 +-
 vpx_dsp/vpx_dsp_rtcd_defs.pl                  |  29 +-
 vpx_dsp/x86/blend_a64_hmask_sse4.c            |  41 ++
 ...nd_mask6b_sse4.c => blend_a64_mask_sse4.c} | 603 +++++++-----------
 vpx_dsp/x86/blend_a64_vmask_sse4.c            | 293 +++++++++
 vpx_dsp/x86/blend_sse4.h                      | 145 +++++
 18 files changed, 1605 insertions(+), 685 deletions(-)
 create mode 100644 test/blend_a64_mask_1d_test.cc
 rename test/{blend_mask6b_test.cc => blend_a64_mask_test.cc} (77%)
 create mode 100644 vpx_dsp/blend.h
 create mode 100644 vpx_dsp/blend_a64_hmask.c
 create mode 100644 vpx_dsp/blend_a64_mask.c
 create mode 100644 vpx_dsp/blend_a64_vmask.c
 delete mode 100644 vpx_dsp/blend_mask.h
 delete mode 100644 vpx_dsp/blend_mask6b.c
 create mode 100644 vpx_dsp/x86/blend_a64_hmask_sse4.c
 rename vpx_dsp/x86/{blend_mask6b_sse4.c => blend_a64_mask_sse4.c} (59%)
 create mode 100644 vpx_dsp/x86/blend_a64_vmask_sse4.c
 create mode 100644 vpx_dsp/x86/blend_sse4.h

diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
new file mode 100644
index 000000000..03e9b7d2a
--- /dev/null
+++ b/test/blend_a64_mask_1d_test.cc
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/register_state_check.h" + +#include "test/function_equivalence_test.h" + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +#include "./vp10_rtcd.h" + +#include "test/acm_random.h" +#include "vp10/common/enums.h" + +#include "vpx_dsp/blend.h" + +using libvpx_test::ACMRandom; +using libvpx_test::FunctionEquivalenceTest; +using std::tr1::make_tuple; + +namespace { + +template +class BlendA64Mask1DTest : public FunctionEquivalenceTest { + public: + static const int kIterations = 10000; + static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides + static const int kMaxHeight = MAX_SB_SIZE; + static const int kBufSize = kMaxWidth * kMaxHeight; + static const int kMaxMaskWidth = 2 * MAX_SB_SIZE; + static const int kMaxMaskSize = kMaxMaskWidth; + + BlendA64Mask1DTest() : rng_(ACMRandom::DeterministicSeed()) {} + + virtual ~BlendA64Mask1DTest() {} + + virtual void Execute(T *p_src0, T *p_src1) = 0; + + void Common() { + w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1); + h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1); + + dst_offset_ = rng_(33); + dst_stride_ = rng_(kMaxWidth + 1 - w_) + w_; + + src0_offset_ = rng_(33); + src0_stride_ = rng_(kMaxWidth + 1 - w_) + w_; + + src1_offset_ = rng_(33); + src1_stride_ = rng_(kMaxWidth + 1 - w_) + w_; + + T *p_src0; + T *p_src1; + + switch (rng_(3)) { + case 0: // Separate sources + p_src0 = src0_; + p_src1 = src1_; + break; + case 1: // src0 == dst + p_src0 = dst_tst_; + src0_stride_ = dst_stride_; + src0_offset_ = dst_offset_; + p_src1 = src1_; + break; + case 2: // src1 == dst + p_src0 = src0_; + p_src1 = dst_tst_; + src1_stride_ = dst_stride_; + src1_offset_ = dst_offset_; + break; + default: + FAIL(); + } + + Execute(p_src0, p_src1); + + for (int r = 0 ; r < h_ ; ++r) { + for (int c = 0 ; c < w_ ; ++c) { + ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c], + dst_tst_[dst_offset_ + r * dst_stride_ + c]); + } + } + } + + ACMRandom rng_; + + T dst_ref_[kBufSize]; + T dst_tst_[kBufSize]; + size_t dst_stride_; + size_t dst_offset_; + + T src0_[kBufSize]; + size_t src0_stride_; + size_t src0_offset_; + + T src1_[kBufSize]; + size_t src1_stride_; + size_t src1_offset_; + + uint8_t mask_[kMaxMaskSize]; + + int w_; + int h_; +}; + +////////////////////////////////////////////////////////////////////////////// +// 8 bit version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); + +class BlendA64Mask1DTest8B : public BlendA64Mask1DTest { + protected: + void Execute(uint8_t *p_src0, uint8_t *p_src1) { + ref_func_(dst_ref_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, + mask_, h_, w_); + + tst_func_(dst_tst_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, + mask_, h_, w_); + } +}; + +TEST_P(BlendA64Mask1DTest8B, RandomValues) { + for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0 ; i < kBufSize ; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand8(); + src1_[i] = rng_.Rand8(); + } + + for (int i = 0 ; i < kMaxMaskSize ; ++i) + mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1); + + Common(); + } +} + 
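For reference, the scalar operation the tests above and below exercise is the
VPX_BLEND_A64 macro that this patch adds in vpx_dsp/blend.h. A minimal
standalone sketch of it (the helper name below is illustrative, not something
the patch defines):

    #include <stdint.h>

    /* Alpha blend with alpha in [0, 64]: 64 selects v0 entirely, 0 selects v1
     * entirely. Rounding matches ROUND_POWER_OF_TWO(x, 6). */
    static uint8_t blend_a64_scalar(int alpha, uint8_t v0, uint8_t v1) {
      return (uint8_t)((alpha * v0 + (64 - alpha) * v1 + 32) >> 6);
    }

    /* e.g. alpha == 32 averages the inputs:
     * blend_a64_scalar(32, 200, 100) == 150 */

The C reference and the SSE4.1 implementations under test must agree with this
for every mask value in [0, VPX_BLEND_A64_MAX_ALPHA].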
+TEST_P(BlendA64Mask1DTest8B, ExtremeValues) { + for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0 ; i < kBufSize ; ++i) { + dst_ref_[i] = rng_(2) + 254; + dst_tst_[i] = rng_(2) + 254; + src0_[i] = rng_(2) + 254; + src1_[i] = rng_(2) + 254; + } + + for (int i = 0 ; i < kMaxMaskSize ; ++i) + mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1; + + Common(); + } +} + +static void blend_a64_hmask_ref( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize] + [BlendA64Mask1DTest8B::kMaxMaskSize]; + + for (int row = 0 ; row < h ; ++row) + for (int col = 0 ; col < w ; ++col) + mask2d[row][col] = mask[col]; + + vpx_blend_a64_mask_c(dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, + h, w, 0, 0); +} + +static void blend_a64_vmask_ref( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize] + [BlendA64Mask1DTest8B::kMaxMaskSize]; + + for (int row = 0 ; row < h ; ++row) + for (int col = 0 ; col < w ; ++col) + mask2d[row][col] = mask[row]; + + vpx_blend_a64_mask_c(dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, + h, w, 0, 0); +} + +INSTANTIATE_TEST_CASE_P( + C, BlendA64Mask1DTest8B, + ::testing::Values( + make_tuple(blend_a64_hmask_ref, vpx_blend_a64_hmask_c), + make_tuple(blend_a64_vmask_ref, vpx_blend_a64_vmask_c))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_CASE_P( + SSE4_1, BlendA64Mask1DTest8B, + ::testing::Values( + make_tuple(blend_a64_hmask_ref, vpx_blend_a64_hmask_sse4_1), + make_tuple(blend_a64_vmask_ref, vpx_blend_a64_vmask_sse4_1))); +#endif // HAVE_SSE4_1 + +#if CONFIG_VP9_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// High bit-depth version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd); + +class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest { + protected: + void Execute(uint16_t *p_src0, uint16_t *p_src1) { + ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, + mask_, h_, w_, bit_depth_); + + ASM_REGISTER_STATE_CHECK( + tst_func_(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, + mask_, h_, w_, bit_depth_)); + } + + int bit_depth_; +}; + +TEST_P(BlendA64Mask1DTestHBD, RandomValues) { + for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) { + switch (rng_(3)) { + case 0: + bit_depth_ = 8; + break; + case 1: + bit_depth_ = 10; + break; + default: + bit_depth_ = 12; + break; + } + + const int hi = 1 << bit_depth_; + + for (int i = 0 ; i < kBufSize ; ++i) { + dst_ref_[i] = rng_(hi); + dst_tst_[i] = rng_(hi); + src0_[i] = rng_(hi); + src1_[i] = rng_(hi); + } + + for (int i = 0 ; i < kMaxMaskSize ; ++i) + mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1); + + Common(); + } +} 
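A quick back-of-the-envelope check of why the extreme-value cases below are
worth having (illustrative reasoning, not taken from the patch): the largest
per-pixel product the blend forms is VPX_BLEND_A64_MAX_ALPHA * max_pixel. For
8-bit input that is 64 * 255 = 16320, which fits in 16 bits, but for 12-bit
input it is 64 * 4095 = 262080, which does not (the 16-bit maximum is 65535).
Driving the sources to the top of the bit-depth range while the mask sits at
63 or 64 is therefore the pattern most likely to expose an optimized kernel
that keeps these intermediate products in 16-bit lanes.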
+ +TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) { + for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) { + switch (rng_(3)) { + case 0: + bit_depth_ = 8; + break; + case 1: + bit_depth_ = 10; + break; + default: + bit_depth_ = 12; + break; + } + + const int hi = 1 << bit_depth_; + const int lo = hi - 2; + + for (int i = 0 ; i < kBufSize ; ++i) { + dst_ref_[i] = rng_(hi - lo) + lo; + dst_tst_[i] = rng_(hi - lo) + lo; + src0_[i] = rng_(hi - lo) + lo; + src1_[i] = rng_(hi - lo) + lo; + } + + for (int i = 0 ; i < kMaxMaskSize ; ++i) + mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1; + + Common(); + } +} + +static void highbd_blend_a64_hmask_ref( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize] + [BlendA64Mask1DTestHBD::kMaxMaskSize]; + + for (int row = 0 ; row < h ; ++row) + for (int col = 0 ; col < w ; ++col) + mask2d[row][col] = mask[col]; + + vpx_highbd_blend_a64_mask_c(dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + &mask2d[0][0], + BlendA64Mask1DTestHBD::kMaxMaskSize, + h, w, 0, 0, bd); +} + +static void highbd_blend_a64_vmask_ref( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize] + [BlendA64Mask1DTestHBD::kMaxMaskSize]; + + for (int row = 0 ; row < h ; ++row) + for (int col = 0 ; col < w ; ++col) + mask2d[row][col] = mask[row]; + + vpx_highbd_blend_a64_mask_c(dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + &mask2d[0][0], + BlendA64Mask1DTestHBD::kMaxMaskSize, + h, w, 0, 0, bd); +} + +INSTANTIATE_TEST_CASE_P( + C, BlendA64Mask1DTestHBD, + ::testing::Values( + make_tuple(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_c), + make_tuple(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_c))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_CASE_P( + SSE4_1, BlendA64Mask1DTestHBD, + ::testing::Values( + make_tuple(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_sse4_1), + make_tuple(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_sse4_1))); +#endif // HAVE_SSE4_1 + +#endif // CONFIG_VP9_HIGHBITDEPTH +} // namespace diff --git a/test/blend_mask6b_test.cc b/test/blend_a64_mask_test.cc similarity index 77% rename from test/blend_mask6b_test.cc rename to test/blend_a64_mask_test.cc index 5cd7bf7ac..08ee91d5c 100644 --- a/test/blend_mask6b_test.cc +++ b/test/blend_a64_mask_test.cc @@ -26,6 +26,8 @@ #include "test/acm_random.h" #include "vp10/common/enums.h" +#include "vpx_dsp/blend.h" + using libvpx_test::ACMRandom; using libvpx_test::FunctionEquivalenceTest; using std::tr1::make_tuple; @@ -33,7 +35,7 @@ using std::tr1::make_tuple; namespace { template -class BlendMask6Test : public FunctionEquivalenceTest { +class BlendA64MaskTest : public FunctionEquivalenceTest { protected: static const int kIterations = 10000; static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides @@ -42,15 +44,15 @@ class BlendMask6Test : public FunctionEquivalenceTest { static const int kMaxMaskWidth = 2 * MAX_SB_SIZE; static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth; - BlendMask6Test() : rng_(ACMRandom::DeterministicSeed()) {} + BlendA64MaskTest() : rng_(ACMRandom::DeterministicSeed()) {} - virtual ~BlendMask6Test() {} + virtual ~BlendA64MaskTest() {} - virtual void Execute(T *p_src0, T 
*p_src1) = 0; + virtual void Execute(const T *p_src0, const T *p_src1) = 0; void Common() { - w_ = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 2) + 2); - h_ = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 2) + 2); + w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1); + h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1); subx_ = rng_(2); suby_ = rng_(2); @@ -131,14 +133,14 @@ class BlendMask6Test : public FunctionEquivalenceTest { ////////////////////////////////////////////////////////////////////////////// typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int suby, int subx); + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int suby, int subx); -class BlendMask6Test8B : public BlendMask6Test { +class BlendA64MaskTest8B : public BlendA64MaskTest { protected: - void Execute(uint8_t *p_src0, uint8_t *p_src1) { + void Execute(const uint8_t *p_src0, const uint8_t *p_src1) { ref_func_(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_, src0_stride_, p_src1 + src1_offset_, src1_stride_, @@ -153,7 +155,7 @@ class BlendMask6Test8B : public BlendMask6Test { } }; -TEST_P(BlendMask6Test8B, RandomValues) { +TEST_P(BlendA64MaskTest8B, RandomValues) { for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) { for (int i = 0 ; i < kBufSize ; ++i) { dst_ref_[i] = rng_.Rand8(); @@ -164,13 +166,13 @@ TEST_P(BlendMask6Test8B, RandomValues) { } for (int i = 0 ; i < kMaxMaskSize ; ++i) - mask_[i] = rng_(65); + mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1); Common(); } } -TEST_P(BlendMask6Test8B, ExtremeValues) { +TEST_P(BlendA64MaskTest8B, ExtremeValues) { for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) { for (int i = 0 ; i < kBufSize ; ++i) { dst_ref_[i] = rng_(2) + 254; @@ -180,7 +182,7 @@ TEST_P(BlendMask6Test8B, ExtremeValues) { } for (int i = 0 ; i < kMaxMaskSize ; ++i) - mask_[i] = rng_(2) + 63; + mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1; Common(); } @@ -188,8 +190,9 @@ TEST_P(BlendMask6Test8B, ExtremeValues) { #if HAVE_SSE4_1 INSTANTIATE_TEST_CASE_P( - SSE4_1_C_COMPARE, BlendMask6Test8B, - ::testing::Values(make_tuple(&vpx_blend_mask6b_c, &vpx_blend_mask6b_sse4_1))); + SSE4_1_C_COMPARE, BlendA64MaskTest8B, + ::testing::Values(make_tuple(vpx_blend_a64_mask_c, + vpx_blend_a64_mask_sse4_1))); #endif // HAVE_SSE4_1 #if CONFIG_VP9_HIGHBITDEPTH @@ -198,14 +201,14 @@ INSTANTIATE_TEST_CASE_P( ////////////////////////////////////////////////////////////////////////////// typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int suby, int subx, int bd); + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int suby, int subx, int bd); -class BlendMask6TestHBD : public BlendMask6Test { +class BlendA64MaskTestHBD : public BlendA64MaskTest { protected: - void Execute(uint16_t *p_src0, uint16_t *p_src1) { + void Execute(const uint16_t *p_src0, const uint16_t *p_src1) { ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, @@ -223,7 +226,7 @@ class BlendMask6TestHBD : public BlendMask6Test { int bit_depth_; }; 
-TEST_P(BlendMask6TestHBD, RandomValues) { +TEST_P(BlendA64MaskTestHBD, RandomValues) { for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) { switch (rng_(3)) { case 0: @@ -247,13 +250,13 @@ TEST_P(BlendMask6TestHBD, RandomValues) { } for (int i = 0 ; i < kMaxMaskSize ; ++i) - mask_[i] = rng_(65); + mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1); Common(); } } -TEST_P(BlendMask6TestHBD, ExtremeValues) { +TEST_P(BlendA64MaskTestHBD, ExtremeValues) { for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) { switch (rng_(3)) { case 0: @@ -278,7 +281,7 @@ TEST_P(BlendMask6TestHBD, ExtremeValues) { } for (int i = 0 ; i < kMaxMaskSize ; ++i) - mask_[i] = rng_(65); + mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1; Common(); } @@ -286,9 +289,9 @@ TEST_P(BlendMask6TestHBD, ExtremeValues) { #if HAVE_SSE4_1 INSTANTIATE_TEST_CASE_P( - SSE4_1_C_COMPARE, BlendMask6TestHBD, - ::testing::Values(make_tuple(&vpx_highbd_blend_mask6b_c, - &vpx_highbd_blend_mask6b_sse4_1))); + SSE4_1_C_COMPARE, BlendA64MaskTestHBD, + ::testing::Values(make_tuple(vpx_highbd_blend_a64_mask_c, + vpx_highbd_blend_a64_mask_sse4_1))); #endif // HAVE_SSE4_1 #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/test/test.mk b/test/test.mk index dd6abe123..67fe70507 100644 --- a/test/test.mk +++ b/test/test.mk @@ -178,11 +178,12 @@ LIBVPX_TEST_SRCS-$(CONFIG_EXT_TILE) += vp10_ext_tile_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_1d_test.cc ifeq ($(CONFIG_EXT_INTER),yes) LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6b_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_wedge_utils_test.cc endif diff --git a/test/vp10_wedge_utils_test.cc b/test/vp10_wedge_utils_test.cc index 4659c9aa8..7a541b266 100644 --- a/test/vp10_wedge_utils_test.cc +++ b/test/vp10_wedge_utils_test.cc @@ -104,7 +104,7 @@ TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingEquiv) { p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX); } - vpx_blend_mask6b(p, w, p0, w, p1, w, m, w, h, w, 0, 0); + vpx_blend_a64_mask(p, w, p0, w, p1, w, m, w, h, w, 0, 0); vpx_subtract_block(h, w, r0, w, s, w, p0, w); vpx_subtract_block(h, w, r1, w, s, w, p1, w); diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c index 7d0065319..3a196a57e 100644 --- a/vp10/common/reconinter.c +++ b/vp10/common/reconinter.c @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/blend.h" #include "vp10/common/blockd.h" #include "vp10/common/reconinter.h" @@ -448,8 +449,8 @@ void vp10_init_wedge_masks() { #if CONFIG_SUPERTX static void build_masked_compound_wedge_extend( uint8_t *dst, int dst_stride, - uint8_t *src0, int src0_stride, - uint8_t *src1, int src1_stride, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, int wedge_index, int wedge_sign, BLOCK_SIZE sb_type, @@ -459,18 +460,18 @@ static void build_masked_compound_wedge_extend( const int subw = (2 << b_width_log2_lookup[sb_type]) == w; const uint8_t *mask = vp10_get_soft_mask( wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y); - vpx_blend_mask6b(dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, MASK_MASTER_STRIDE, - h, w, subh, subw); + vpx_blend_a64_mask(dst, dst_stride, + src0, 
src0_stride, + src1, src1_stride, + mask, MASK_MASTER_STRIDE, + h, w, subh, subw); } #if CONFIG_VP9_HIGHBITDEPTH static void build_masked_compound_wedge_extend_highbd( uint8_t *dst_8, int dst_stride, - uint8_t *src0_8, int src0_stride, - uint8_t *src1_8, int src1_stride, + const uint8_t *src0_8, int src0_stride, + const uint8_t *src1_8, int src1_stride, int wedge_index, int wedge_sign, BLOCK_SIZE sb_type, int wedge_offset_x, int wedge_offset_y, @@ -479,52 +480,54 @@ static void build_masked_compound_wedge_extend_highbd( const int subw = (2 << b_width_log2_lookup[sb_type]) == w; const uint8_t *mask = vp10_get_soft_mask( wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y); - vpx_highbd_blend_mask6b(dst_8, dst_stride, - src0_8, src0_stride, - src1_8, src1_stride, - mask, MASK_MASTER_STRIDE, - h, w, subh, subw, bd); + vpx_highbd_blend_a64_mask(dst_8, dst_stride, + src0_8, src0_stride, + src1_8, src1_stride, + mask, MASK_MASTER_STRIDE, + h, w, subh, subw, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_SUPERTX -static void build_masked_compound_wedge(uint8_t *dst, int dst_stride, - uint8_t *src0, int src0_stride, - uint8_t *src1, int src1_stride, - int wedge_index, int wedge_sign, - BLOCK_SIZE sb_type, - int h, int w) { +static void build_masked_compound_wedge( + uint8_t *dst, int dst_stride, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int wedge_index, int wedge_sign, + BLOCK_SIZE sb_type, + int h, int w) { // Derive subsampling from h and w passed in. May be refactored to // pass in subsampling factors directly. const int subh = (2 << b_height_log2_lookup[sb_type]) == h; const int subw = (2 << b_width_log2_lookup[sb_type]) == w; const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); - vpx_blend_mask6b(dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, 4 * num_4x4_blocks_wide_lookup[sb_type], - h, w, subh, subw); + vpx_blend_a64_mask(dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + mask, 4 * num_4x4_blocks_wide_lookup[sb_type], + h, w, subh, subw); } #if CONFIG_VP9_HIGHBITDEPTH -static void build_masked_compound_wedge_highbd(uint8_t *dst_8, int dst_stride, - uint8_t *src0_8, int src0_stride, - uint8_t *src1_8, int src1_stride, - int wedge_index, int wedge_sign, - BLOCK_SIZE sb_type, - int h, int w, int bd) { +static void build_masked_compound_wedge_highbd( + uint8_t *dst_8, int dst_stride, + const uint8_t *src0_8, int src0_stride, + const uint8_t *src1_8, int src1_stride, + int wedge_index, int wedge_sign, + BLOCK_SIZE sb_type, + int h, int w, int bd) { // Derive subsampling from h and w passed in. May be refactored to // pass in subsampling factors directly. 
const int subh = (2 << b_height_log2_lookup[sb_type]) == h; const int subw = (2 << b_width_log2_lookup[sb_type]) == w; const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); - vpx_highbd_blend_mask6b(dst_8, dst_stride, - src0_8, src0_stride, - src1_8, src1_stride, - mask, 4 * num_4x4_blocks_wide_lookup[sb_type], - h, w, subh, subw, bd); + vpx_highbd_blend_a64_mask(dst_8, dst_stride, + src0_8, src0_stride, + src1_8, src1_stride, + mask, 4 * num_4x4_blocks_wide_lookup[sb_type], + h, w, subh, subw, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1878,12 +1881,10 @@ static void combine_interintra(INTERINTRA_MODE mode, BLOCK_SIZE plane_bsize, uint8_t *comppred, int compstride, - uint8_t *interpred, + const uint8_t *interpred, int interstride, - uint8_t *intrapred, + const uint8_t *intrapred, int intrastride) { - const int scale_bits = 8; - const int scale_max = (1 << scale_bits); const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize]; const int size_scale = ii_size_scales[plane_bsize]; @@ -1896,11 +1897,11 @@ static void combine_interintra(INTERINTRA_MODE mode, bsize); const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw; const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh; - vpx_blend_mask6b(comppred, compstride, - intrapred, intrastride, - interpred, interstride, - mask, 4 * num_4x4_blocks_wide_lookup[bsize], - bh, bw, subh, subw); + vpx_blend_a64_mask(comppred, compstride, + intrapred, intrastride, + interpred, interstride, + mask, 4 * num_4x4_blocks_wide_lookup[bsize], + bh, bw, subh, subw); } return; } @@ -1911,10 +1912,9 @@ static void combine_interintra(INTERINTRA_MODE mode, for (j = 0; j < bw; ++j) { int scale = ii_weights1d[i * size_scale]; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -1924,10 +1924,9 @@ static void combine_interintra(INTERINTRA_MODE mode, for (j = 0; j < bw; ++j) { int scale = ii_weights1d[j * size_scale]; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -1939,10 +1938,9 @@ static void combine_interintra(INTERINTRA_MODE mode, int scale = (ii_weights1d[i * size_scale] * 3 + ii_weights1d[j * size_scale]) >> 2; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -1954,10 +1952,9 @@ static void combine_interintra(INTERINTRA_MODE mode, int scale = (ii_weights1d[j * size_scale] * 3 + ii_weights1d[i * size_scale]) >> 2; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -1967,10 +1964,9 @@ static void combine_interintra(INTERINTRA_MODE mode, for (j = 0; j < bw; ++j) { int scale = ii_weights1d[(i < j ? 
i : j) * size_scale]; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -1981,10 +1977,9 @@ static void combine_interintra(INTERINTRA_MODE mode, int scale = (ii_weights1d[i * size_scale] + ii_weights1d[j * size_scale]) >> 1; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -1995,10 +1990,8 @@ static void combine_interintra(INTERINTRA_MODE mode, for (i = 0; i < bh; ++i) { for (j = 0; j < bw; ++j) { comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - interpred[i * interstride + j] + - intrapred[i * intrastride + j], - 1); + VPX_BLEND_AVG(intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -2014,20 +2007,18 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, BLOCK_SIZE plane_bsize, uint8_t *comppred8, int compstride, - uint8_t *interpred8, + const uint8_t *interpred8, int interstride, - uint8_t *intrapred8, + const uint8_t *intrapred8, int intrastride, int bd) { - const int scale_bits = 8; - const int scale_max = (1 << scale_bits); const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize]; const int size_scale = ii_size_scales[plane_bsize]; int i, j; uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8); - uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8); - uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8); + const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8); + const uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8); if (use_wedge_interintra) { if (is_interintra_wedge_used(bsize)) { @@ -2036,11 +2027,11 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, bsize); const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh; const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw; - vpx_highbd_blend_mask6b(comppred8, compstride, - intrapred8, intrastride, - interpred8, interstride, - mask, bw, - bh, bw, subh, subw, bd); + vpx_highbd_blend_a64_mask(comppred8, compstride, + intrapred8, intrastride, + interpred8, interstride, + mask, bw, + bh, bw, subh, subw, bd); } return; } @@ -2051,10 +2042,9 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, for (j = 0; j < bw; ++j) { int scale = ii_weights1d[i * size_scale]; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -2064,10 +2054,9 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, for (j = 0; j < bw; ++j) { int scale = ii_weights1d[j * size_scale]; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -2079,10 +2068,9 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, int scale = (ii_weights1d[i * size_scale] * 3 + ii_weights1d[j * size_scale]) >> 2; comppred[i * compstride + j] = - 
ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -2094,10 +2082,9 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, int scale = (ii_weights1d[j * size_scale] * 3 + ii_weights1d[i * size_scale]) >> 2; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -2107,10 +2094,9 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, for (j = 0; j < bw; ++j) { int scale = ii_weights1d[(i < j ? i : j) * size_scale]; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -2121,10 +2107,9 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, int scale = (ii_weights1d[i * size_scale] + ii_weights1d[j * size_scale]) >> 1; comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - (scale_max - scale) * interpred[i * interstride + j] + - scale * intrapred[i * intrastride + j], - scale_bits); + VPX_BLEND_A256(scale, + intrapred[i * intrastride + j], + interpred[i * interstride + j]); } } break; @@ -2135,10 +2120,8 @@ static void combine_interintra_highbd(INTERINTRA_MODE mode, for (i = 0; i < bh; ++i) { for (j = 0; j < bw; ++j) { comppred[i * compstride + j] = - ROUND_POWER_OF_TWO( - interpred[i * interstride + j] + - intrapred[i * intrastride + j], - 1); + VPX_BLEND_AVG(interpred[i * interstride + j], + intrapred[i * intrastride + j]); } } break; @@ -2239,8 +2222,8 @@ void vp10_build_intra_predictors_for_interintra( void vp10_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - uint8_t *inter_pred, int inter_stride, - uint8_t *intra_pred, int intra_stride) { + const uint8_t *inter_pred, int inter_stride, + const uint8_t *intra_pred, int intra_stride) { const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h index 4ede3e9e9..73218316c 100644 --- a/vp10/common/reconinter.h +++ b/vp10/common/reconinter.h @@ -631,8 +631,8 @@ void vp10_build_intra_predictors_for_interintra( void vp10_combine_interintra( MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - uint8_t *inter_pred, int inter_stride, - uint8_t *intra_pred, int intra_stride); + const uint8_t *inter_pred, int inter_stride, + const uint8_t *intra_pred, int intra_stride); void vp10_build_interintra_predictors_sbuv(MACROBLOCKD *xd, uint8_t *upred, uint8_t *vpred, diff --git a/vpx_dsp/blend.h b/vpx_dsp/blend.h new file mode 100644 index 000000000..109183acc --- /dev/null +++ b/vpx_dsp/blend.h @@ -0,0 +1,40 @@ +/* +* Copyright (c) 2016 The WebM project authors. All Rights Reserved. +* +* Use of this source code is governed by a BSD-style license +* that can be found in the LICENSE file in the root of the source +* tree. An additional intellectual property rights grant can be found +* in the file PATENTS. All contributing project authors may +* be found in the AUTHORS file in the root of the source tree. 
+*/ + +#ifndef VPX_DSP_BLEND_H_ +#define VPX_DSP_BLEND_H_ + +#include "vpx_ports/mem.h" + +// Various blending functions and macros. +// See also the vpx_blend_* functions in vpx_dsp_rtcd.h + +// Alpha blending with alpha values from the range [0, 64], where 64 +// means use the first input and 0 means use the second input. +#define VPX_BLEND_A64_ROUND_BITS 6 +#define VPX_BLEND_A64_MAX_ALPHA (1 << VPX_BLEND_A64_ROUND_BITS) // 64 + +#define VPX_BLEND_A64(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ + VPX_BLEND_A64_ROUND_BITS) + +// Alpha blending with alpha values from the range [0, 256], where 256 +// means use the first input and 0 means use the second input. +#define VPX_BLEND_A256_ROUND_BITS 8 +#define VPX_BLEND_A256_MAX_ALPHA (1 << VPX_BLEND_A256_ROUND_BITS) // 256 + +#define VPX_BLEND_A256(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ + VPX_BLEND_A256_ROUND_BITS) + +// Blending by averaging. +#define VPX_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) + +#endif // VPX_DSP_BLEND_H_ diff --git a/vpx_dsp/blend_a64_hmask.c b/vpx_dsp/blend_a64_hmask.c new file mode 100644 index 000000000..90f3415ff --- /dev/null +++ b/vpx_dsp/blend_a64_hmask.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/blend.h" + +#include "./vpx_dsp_rtcd.h" + +void vpx_blend_a64_hmask_c( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j], + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_blend_a64_hmask_c( + uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j], + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/blend_a64_mask.c b/vpx_dsp/blend_a64_mask.c new file mode 100644 index 000000000..1649798e4 --- /dev/null +++ b/vpx_dsp/blend_a64_mask.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2016 
The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/blend.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#include "./vpx_dsp_rtcd.h" + +// Blending with alpha mask. Mask values come from the range [0, 64], +// as described for VPX_BLEND_A64 in vpx_dsp/blned.h. src0 or src1 can +// be the same as dst, or dst can be different from both sources. + +void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int subh, int subw) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = VPX_BLEND_A64(m, + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = + ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = VPX_BLEND_A64(m, + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = VPX_BLEND_A64(m, + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = VPX_BLEND_A64(m, + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int subh, int subw, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = VPX_BLEND_A64(m, + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < 
w; ++j) { + const int m = + ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = VPX_BLEND_A64(m, + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = VPX_BLEND_A64(m, + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = VPX_BLEND_A64(m, + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/blend_a64_vmask.c b/vpx_dsp/blend_a64_vmask.c new file mode 100644 index 000000000..5d48a8336 --- /dev/null +++ b/vpx_dsp/blend_a64_vmask.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/blend.h" + +#include "./vpx_dsp_rtcd.h" + +void vpx_blend_a64_vmask_c( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = VPX_BLEND_A64(m, + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_blend_a64_vmask_c( + uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = VPX_BLEND_A64(m, + src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/blend_mask.h b/vpx_dsp/blend_mask.h deleted file mode 100644 index 91c5f4d9e..000000000 --- a/vpx_dsp/blend_mask.h +++ /dev/null @@ -1,17 +0,0 @@ -/* -* Copyright (c) 2016 The WebM project authors. All Rights Reserved. 
-* -* Use of this source code is governed by a BSD-style license -* that can be found in the LICENSE file in the root of the source -* tree. An additional intellectual property rights grant can be found -* in the file PATENTS. All contributing project authors may -* be found in the AUTHORS file in the root of the source tree. -*/ - -#ifndef VPX_DSP_BLEND_MASK_H_ -#define VPX_DSP_BLEND_MASK_H_ - -// Use blend_mask6b() for 6 bit masks -#define MASK_BITS6 6 - -#endif // VPX_DSP_BLEND_MASK_H_ diff --git a/vpx_dsp/blend_mask6b.c b/vpx_dsp/blend_mask6b.c deleted file mode 100644 index abdefa300..000000000 --- a/vpx_dsp/blend_mask6b.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2016 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" -#include "vpx_dsp/blend_mask.h" -#include "vpx_dsp/vpx_dsp_common.h" - -#include "./vpx_dsp_rtcd.h" - -void vpx_blend_mask6b_c(uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int subh, int subw) { - int i, j; - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 4); - assert(w >= 4); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - if (subw == 0 && subh == 0) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - const int m0 = mask[i * mask_stride + j]; - const int m1 = ((1 << MASK_BITS6) - m0); - dst[i * dst_stride + j] = - ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + - src1[i * src1_stride + j] * m1, MASK_BITS6); - } - } else if (subw == 1 && subh == 1) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - const int m0 = - ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] + - mask[(2 * i + 1) * mask_stride + (2 * j)] + - mask[(2 * i) * mask_stride + (2 * j + 1)] + - mask[(2 * i + 1) * mask_stride + (2 * j + 1)], - 2); - const int m1 = ((1 << MASK_BITS6) - m0); - dst[i * dst_stride + j] = - ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + - src1[i * src1_stride + j] * m1, MASK_BITS6); - } - } else if (subw == 1 && subh == 0) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - const int m0 = - ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] + - mask[i * mask_stride + (2 * j + 1)], 1); - const int m1 = ((1 << MASK_BITS6) - m0); - dst[i * dst_stride + j] = - ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + - src1[i * src1_stride + j] * m1, MASK_BITS6); - } - } else { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - const int m0 = - ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] + - mask[(2 * i + 1) * mask_stride + j], 1); - const int m1 = ((1 << MASK_BITS6) - m0); - dst[i * dst_stride + j] = - ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + - src1[i * src1_stride + j] * m1, MASK_BITS6); - } - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_blend_mask6b_c(uint8_t *dst_8, uint32_t dst_stride, - uint8_t *src0_8, uint32_t src0_stride, - uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int subh, int subw, int bd) { - int i, j; - uint16_t 
*dst = CONVERT_TO_SHORTPTR(dst_8); - uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); - uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 4); - assert(w >= 4); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - assert(bd == 8 || bd == 10 || bd == 12); - - if (subw == 0 && subh == 0) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - const int m0 = mask[i * mask_stride + j]; - const int m1 = ((1 << MASK_BITS6) - m0); - dst[i * dst_stride + j] = - ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + - src1[i * src1_stride + j] * m1, MASK_BITS6); - } - } else if (subw == 1 && subh == 1) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - const int m0 = - ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] + - mask[(2 * i + 1) * mask_stride + (2 * j)] + - mask[(2 * i) * mask_stride + (2 * j + 1)] + - mask[(2 * i + 1) * mask_stride + (2 * j + 1)], - 2); - const int m1 = ((1 << MASK_BITS6) - m0); - dst[i * dst_stride + j] = - ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + - src1[i * src1_stride + j] * m1, MASK_BITS6); - } - } else if (subw == 1 && subh == 0) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - const int m0 = - ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] + - mask[i * mask_stride + (2 * j + 1)], 1); - const int m1 = ((1 << MASK_BITS6) - m0); - dst[i * dst_stride + j] = - ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + - src1[i * src1_stride + j] * m1, MASK_BITS6); - } - } else { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - const int m0 = - ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] + - mask[(2 * i + 1) * mask_stride + j], 1); - const int m1 = ((1 << MASK_BITS6) - m0); - dst[i * dst_stride + j] = - ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 + - src1[i * src1_stride + j] * m1, MASK_BITS6); - } - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f48dd060a..3eb7a9fdf 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -70,11 +70,14 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c # inter predictions ifeq ($(CONFIG_VP10),yes) -ifeq ($(CONFIG_EXT_INTER),yes) -DSP_SRCS-yes += blend_mask6b.c -DSP_SRCS-yes += blend_mask.h -DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_mask6b_sse4.c -endif #CONFIG_EXT_INTER +DSP_SRCS-yes += blend.h +DSP_SRCS-yes += blend_a64_mask.c +DSP_SRCS-yes += blend_a64_hmask.c +DSP_SRCS-yes += blend_a64_vmask.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c endif #CONFIG_VP10 # interpolation filters diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 59f137e25..02c8727c6 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -959,6 +959,27 @@ if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCO } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER +if (vpx_config("CONFIG_VP10") eq "yes") { + # + # Alpha blending with mask + # + add_proto qw/void vpx_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx"; + add_proto qw/void vpx_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w"; + add_proto qw/void vpx_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w"; + specialize "vpx_blend_a64_mask", qw/sse4_1/; + specialize "vpx_blend_a64_hmask", qw/sse4_1/; + specialize "vpx_blend_a64_vmask", qw/sse4_1/; + + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/void vpx_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd"; + add_proto qw/void vpx_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd"; + add_proto qw/void vpx_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd"; + specialize "vpx_highbd_blend_a64_mask", qw/sse4_1/; + specialize "vpx_highbd_blend_a64_hmask", qw/sse4_1/; + specialize "vpx_highbd_blend_a64_vmask", qw/sse4_1/; + } +} # CONFIG_VP10 + if (vpx_config("CONFIG_ENCODERS") eq "yes") { # # Block subtraction @@ -1384,14 +1405,6 @@ if (vpx_config("CONFIG_EXT_INTER") eq "yes") { } } } - - add_proto qw/void vpx_blend_mask6b/, "uint8_t *dst, uint32_t dst_stride, uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx"; - specialize "vpx_blend_mask6b", qw/sse4_1/; - - if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_blend_mask6b/, "uint8_t *dst, uint32_t dst_stride, uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd"; - specialize "vpx_highbd_blend_mask6b", qw/sse4_1/; - } } # diff --git a/vpx_dsp/x86/blend_a64_hmask_sse4.c b/vpx_dsp/x86/blend_a64_hmask_sse4.c new file mode 100644 index 000000000..a10e0771b --- /dev/null +++ b/vpx_dsp/x86/blend_a64_hmask_sse4.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx/vpx_integer.h" + +#include "./vpx_dsp_rtcd.h" + +// To start out, just dispatch to the function using the 2D mask and +// pass mask stride as 0. This can be improved upon if necessary. 
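To make the dispatch concrete: with mask_stride == 0 every output row of the
2D kernel reads the same mask row, since mask[i * 0 + j] == mask[j], and
suby == subx == 0 keeps the mask un-subsampled, so the 2D path computes
exactly the hmask blend. A sketch of the resulting per-pixel computation,
mirroring the vpx_blend_a64_mask_c reference added above (the function name
here is illustrative only):

    #include <stdint.h>

    static void hmask_via_2d_sketch(uint8_t *dst, uint32_t dst_stride,
                                    const uint8_t *src0, uint32_t src0_stride,
                                    const uint8_t *src1, uint32_t src1_stride,
                                    const uint8_t *mask, int h, int w) {
      int i, j;
      for (i = 0; i < h; ++i) {
        for (j = 0; j < w; ++j) {
          /* mask[i * mask_stride + j] with mask_stride == 0 */
          const int m = mask[j];
          dst[i * dst_stride + j] =
              (uint8_t)((m * src0[i * src0_stride + j] +
                         (64 - m) * src1[i * src1_stride + j] + 32) >> 6);
        }
      }
    }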
+ +void vpx_blend_a64_hmask_sse4_1( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + vpx_blend_a64_mask_sse4_1(dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + mask, 0, h, w, 0, 0); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_blend_a64_hmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, + int bd) { + vpx_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, + src0_8, src0_stride, + src1_8, src1_stride, + mask, 0, h, w, 0, 0, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/blend_mask6b_sse4.c b/vpx_dsp/x86/blend_a64_mask_sse4.c similarity index 59% rename from vpx_dsp/x86/blend_mask6b_sse4.c rename to vpx_dsp/x86/blend_a64_mask_sse4.c index 0b1285632..cdb40c2f5 100644 --- a/vpx_dsp/x86/blend_mask6b_sse4.c +++ b/vpx_dsp/x86/blend_a64_mask_sse4.c @@ -15,61 +15,24 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" #include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/blend_mask.h" +#include "vpx_dsp/blend.h" #include "vpx_dsp/x86/synonyms.h" +#include "vpx_dsp/x86/blend_sse4.h" #include "./vpx_dsp_rtcd.h" -////////////////////////////////////////////////////////////////////////////// -// Common kernels -////////////////////////////////////////////////////////////////////////////// - -static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { - const __m128i v_s0_b = xx_loadl_32(src0); - const __m128i v_s1_b = xx_loadl_32(src1); - const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); - const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); - - const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS6); - - return v_res_w; -} - -static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { - const __m128i v_s0_b = xx_loadl_64(src0); - const __m128i v_s1_b = xx_loadl_64(src1); - const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); - const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); - - const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS6); - - return v_res_w; -} - ////////////////////////////////////////////////////////////////////////////// // No sub-sampling ////////////////////////////////////////////////////////////////////////////// -static void blend_mask6b_w4_sse4_1( +static void blend_a64_mask_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -91,13 +54,13 @@ static void blend_mask6b_w4_sse4_1( } while (--h); } -static void blend_mask6b_w8_sse4_1( +static void blend_a64_mask_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const 
uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -119,13 +82,13 @@ static void blend_mask6b_w8_sse4_1( } while (--h); } -static void blend_mask6b_w16n_sse4_1( +static void blend_a64_mask_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { int c; @@ -157,15 +120,15 @@ static void blend_mask6b_w16n_sse4_1( // Horizontal sub-sampling ////////////////////////////////////////////////////////////////////////////// -static void blend_mask6b_sx_w4_sse4_1( +static void blend_a64_mask_sx_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -189,15 +152,15 @@ static void blend_mask6b_sx_w4_sse4_1( } while (--h); } -static void blend_mask6b_sx_w8_sse4_1( +static void blend_a64_mask_sx_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -221,15 +184,15 @@ static void blend_mask6b_sx_w8_sse4_1( } while (--h); } -static void blend_mask6b_sx_w16n_sse4_1( +static void blend_a64_mask_sx_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { int c; @@ -264,13 +227,13 @@ static void blend_mask6b_sx_w16n_sse4_1( // Vertical sub-sampling ////////////////////////////////////////////////////////////////////////////// -static void blend_mask6b_sy_w4_sse4_1( +static void blend_a64_mask_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i 
v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -295,13 +258,13 @@ static void blend_mask6b_sy_w4_sse4_1( } while (--h); } -static void blend_mask6b_sy_w8_sse4_1( +static void blend_a64_mask_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -326,14 +289,14 @@ static void blend_mask6b_sy_w8_sse4_1( } while (--h); } -static void blend_mask6b_sy_w16n_sse4_1( +static void blend_a64_mask_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_zero = _mm_setzero_si128(); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { int c; @@ -367,15 +330,15 @@ static void blend_mask6b_sy_w16n_sse4_1( // Horizontal and Vertical sub-sampling ////////////////////////////////////////////////////////////////////////////// -static void blend_mask6b_sx_sy_w4_sse4_1( +static void blend_a64_mask_sx_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -404,15 +367,15 @@ static void blend_mask6b_sx_sy_w4_sse4_1( } while (--h); } -static void blend_mask6b_sx_sy_w8_sse4_1( +static void blend_a64_mask_sx_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -441,15 +404,15 @@ static void blend_mask6b_sx_sy_w8_sse4_1( } while (--h); } -static void blend_mask6b_sx_sy_w16n_sse4_1( +static void blend_a64_mask_sx_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { int c; @@ -494,146 +457,67 @@ static void blend_mask6b_sx_sy_w16n_sse4_1( // Dispatch 
////////////////////////////////////////////////////////////////////////////// -void vpx_blend_mask6b_sse4_1(uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int suby, int subx) { +void vpx_blend_a64_mask_sse4_1( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int suby, int subx) { typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride, - uint8_t *src0, uint32_t src0_stride, - uint8_t *src1, uint32_t src1_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w); - static blend_fn blend[3][2][2] = { // width_index X subx X suby + // Dimensions are: width_index X subx X suby + static const blend_fn blend[3][2][2] = { { // w % 16 == 0 - {blend_mask6b_w16n_sse4_1, blend_mask6b_sy_w16n_sse4_1}, - {blend_mask6b_sx_w16n_sse4_1, blend_mask6b_sx_sy_w16n_sse4_1} + {blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1}, + {blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1} }, { // w == 4 - {blend_mask6b_w4_sse4_1, blend_mask6b_sy_w4_sse4_1}, - {blend_mask6b_sx_w4_sse4_1, blend_mask6b_sx_sy_w4_sse4_1} + {blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1}, + {blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1} }, { // w == 8 - {blend_mask6b_w8_sse4_1, blend_mask6b_sy_w8_sse4_1}, - {blend_mask6b_sx_w8_sse4_1, blend_mask6b_sx_sy_w8_sse4_1} + {blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1}, + {blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1} } }; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - assert(h >= 4); - assert(w >= 4); + assert(h >= 1); + assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); - blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, mask_stride, - h, w); + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + vpx_blend_a64_mask_c(dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + mask, mask_stride, + h, w, suby, subx); + } else { + blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + mask, mask_stride, + h, w); + } } #if CONFIG_VP9_HIGHBITDEPTH -////////////////////////////////////////////////////////////////////////////// -// Common kernels -////////////////////////////////////////////////////////////////////////////// - -typedef __m128i (*blend_unit_fn)(uint16_t*src0, uint16_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w); - -static INLINE __m128i blend_4_b10(uint16_t*src0, uint16_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { - const __m128i v_s0_w = xx_loadl_64(src0); - const __m128i v_s1_w = xx_loadl_64(src1); - - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); - - const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS6); - - return v_res_w; -} - -static INLINE __m128i blend_8_b10(uint16_t*src0, uint16_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { - const __m128i v_s0_w = xx_loadu_128(src0); - const __m128i v_s1_w = xx_loadu_128(src1); - - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i 
v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); - - const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS6); - - return v_res_w; -} - -static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { - const __m128i v_s0_w = xx_loadl_64(src0); - const __m128i v_s1_w = xx_loadl_64(src1); - - // Interleave - const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); - const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); - - // Multiply-Add - const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); - - // Scale - const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, MASK_BITS6 - 1); - - // Pack - const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); - - // Round - const __m128i v_res_w = xx_round_epu16(v_pssum_d); - - return v_res_w; -} - -static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { - const __m128i v_s0_w = xx_loadu_128(src0); - const __m128i v_s1_w = xx_loadu_128(src1); - - // Interleave - const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); - const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); - const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); - const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); - - // Multiply-Add - const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); - const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); - - // Scale - const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, MASK_BITS6 - 1); - const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, MASK_BITS6 - 1); - - // Pack - const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); - - // Round - const __m128i v_res_w = xx_round_epu16(v_pssum_d); - - return v_res_w; -} - ////////////////////////////////////////////////////////////////////////////// // No sub-sampling ////////////////////////////////////////////////////////////////////////////// -static INLINE void blend_mask6b_bn_w4_sse4_1( +static INLINE void blend_a64_mask_bn_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { const __m128i v_m0_b = xx_loadl_32(mask); @@ -651,37 +535,37 @@ static INLINE void blend_mask6b_bn_w4_sse4_1( } while (--h); } -static void blend_mask6b_b10_w4_sse4_1( +static void blend_a64_mask_b10_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; - blend_mask6b_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b10); + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); } -static void blend_mask6b_b12_w4_sse4_1( +static void blend_a64_mask_b12_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, 
uint32_t mask_stride, int h, int w) { (void)w; - blend_mask6b_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b12); + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); } -static inline void blend_mask6b_bn_w8n_sse4_1( +static inline void blend_a64_mask_bn_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, blend_unit_fn blend) { - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { int c; @@ -701,41 +585,41 @@ static inline void blend_mask6b_bn_w8n_sse4_1( } while (--h); } -static void blend_mask6b_b10_w8n_sse4_1( +static void blend_a64_mask_b10_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - blend_mask6b_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, - blend_8_b10); + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); } -static void blend_mask6b_b12_w8n_sse4_1( +static void blend_a64_mask_b12_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - blend_mask6b_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, - blend_8_b12); + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// // Horizontal sub-sampling ////////////////////////////////////////////////////////////////////////////// -static INLINE void blend_mask6b_bn_sx_w4_sse4_1( +static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { const __m128i v_r_b = xx_loadl_64(mask); @@ -755,39 +639,39 @@ static INLINE void blend_mask6b_bn_sx_w4_sse4_1( } while (--h); } -static void blend_mask6b_b10_sx_w4_sse4_1( +static void blend_a64_mask_b10_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; - blend_mask6b_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, 
h, - blend_4_b10); + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); } -static void blend_mask6b_b12_sx_w4_sse4_1( +static void blend_a64_mask_b12_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; - blend_mask6b_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b12); + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); } -static INLINE void blend_mask6b_bn_sx_w8n_sse4_1( +static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { int c; @@ -809,39 +693,39 @@ static INLINE void blend_mask6b_bn_sx_w8n_sse4_1( } while (--h); } -static void blend_mask6b_b10_sx_w8n_sse4_1( +static void blend_a64_mask_b10_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - blend_mask6b_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, - blend_8_b10); + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); } -static void blend_mask6b_b12_sx_w8n_sse4_1( +static void blend_a64_mask_b12_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - blend_mask6b_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, - blend_8_b12); + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// // Vertical sub-sampling ////////////////////////////////////////////////////////////////////////////// -static INLINE void blend_mask6b_bn_sy_w4_sse4_1( +static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { const __m128i v_ra_b = xx_loadl_32(mask); @@ -862,37 +746,37 @@ static INLINE void 
blend_mask6b_bn_sy_w4_sse4_1( } while (--h); } -static void blend_mask6b_b10_sy_w4_sse4_1( +static void blend_a64_mask_b10_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; - blend_mask6b_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b10); + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); } -static void blend_mask6b_b12_sy_w4_sse4_1( +static void blend_a64_mask_b12_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; - blend_mask6b_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b12); + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); } -static INLINE void blend_mask6b_bn_sy_w8n_sse4_1( +static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, blend_unit_fn blend) { - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { int c; @@ -915,41 +799,41 @@ static INLINE void blend_mask6b_bn_sy_w8n_sse4_1( } while (--h); } -static void blend_mask6b_b10_sy_w8n_sse4_1( +static void blend_a64_mask_b10_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - blend_mask6b_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, - blend_8_b10); + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); } -static void blend_mask6b_b12_sy_w8n_sse4_1( +static void blend_a64_mask_b12_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - blend_mask6b_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, - blend_8_b12); + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// // Horizontal and Vertical sub-sampling ////////////////////////////////////////////////////////////////////////////// -static INLINE void blend_mask6b_bn_sx_sy_w4_sse4_1( +static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t 
*src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { const __m128i v_ra_b = xx_loadl_64(mask); @@ -974,39 +858,39 @@ static INLINE void blend_mask6b_bn_sx_sy_w4_sse4_1( } while (--h); } -static void blend_mask6b_b10_sx_sy_w4_sse4_1( +static void blend_a64_mask_b10_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; - blend_mask6b_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b10); + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); } -static void blend_mask6b_b12_sx_sy_w4_sse4_1( +static void blend_a64_mask_b12_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; - blend_mask6b_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b12); + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); } -static INLINE void blend_mask6b_bn_sx_sy_w8n_sse4_1( +static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6); + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { int c; @@ -1033,82 +917,91 @@ static INLINE void blend_mask6b_bn_sx_sy_w8n_sse4_1( } while (--h); } -static void blend_mask6b_b10_sx_sy_w8n_sse4_1( +static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w) { - blend_mask6b_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, - blend_8_b10); + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); } -static void blend_mask6b_b12_sx_sy_w8n_sse4_1( +static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, 
uint32_t mask_stride, int h, int w) { - blend_mask6b_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, - blend_8_b12); + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// // Dispatch ////////////////////////////////////////////////////////////////////////////// -void vpx_highbd_blend_mask6b_sse4_1(uint8_t *dst_8, uint32_t dst_stride, - uint8_t *src0_8, uint32_t src0_stride, - uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int suby, int subx, int bd) { - uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); - uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); - uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); - +void vpx_highbd_blend_a64_mask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int suby, int subx, int bd) { typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride, - uint16_t *src0, uint32_t src0_stride, - uint16_t *src1, uint32_t src1_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w); - static blend_fn blend[2][2][2][2] = { // bd_index X width_index X subx X suby + // Dimensions are: bd_index X width_index X subx X suby + static const blend_fn blend[2][2][2][2] = { { // bd == 8 or 10 { // w % 8 == 0 - {blend_mask6b_b10_w8n_sse4_1, blend_mask6b_b10_sy_w8n_sse4_1}, - {blend_mask6b_b10_sx_w8n_sse4_1, blend_mask6b_b10_sx_sy_w8n_sse4_1} + {blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1}, + {blend_a64_mask_b10_sx_w8n_sse4_1, blend_a64_mask_b10_sx_sy_w8n_sse4_1} }, { // w == 4 - {blend_mask6b_b10_w4_sse4_1, blend_mask6b_b10_sy_w4_sse4_1}, - {blend_mask6b_b10_sx_w4_sse4_1, blend_mask6b_b10_sx_sy_w4_sse4_1} + {blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1}, + {blend_a64_mask_b10_sx_w4_sse4_1, blend_a64_mask_b10_sx_sy_w4_sse4_1} } }, { // bd == 12 { // w % 8 == 0 - {blend_mask6b_b12_w8n_sse4_1, blend_mask6b_b12_sy_w8n_sse4_1}, - {blend_mask6b_b12_sx_w8n_sse4_1, blend_mask6b_b12_sx_sy_w8n_sse4_1} + {blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1}, + {blend_a64_mask_b12_sx_w8n_sse4_1, blend_a64_mask_b12_sx_sy_w8n_sse4_1} }, { // w == 4 - {blend_mask6b_b12_w4_sse4_1, blend_mask6b_b12_sy_w4_sse4_1}, - {blend_mask6b_b12_sx_w4_sse4_1, blend_mask6b_b12_sx_sy_w4_sse4_1} + {blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1}, + {blend_a64_mask_b12_sx_w4_sse4_1, blend_a64_mask_b12_sx_sy_w4_sse4_1} } } }; - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); - assert(h >= 4); - assert(w >= 4); + assert(h >= 1); + assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); assert(bd == 8 || bd == 10 || bd == 12); - - blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, mask_stride, - h, w); + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + vpx_highbd_blend_a64_mask_c(dst_8, dst_stride, + src0_8, src0_stride, + src1_8, src1_stride, + mask, 
mask_stride, + h, w, suby, subx, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + mask, mask_stride, + h, w); + } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/blend_a64_vmask_sse4.c b/vpx_dsp/x86/blend_a64_vmask_sse4.c new file mode 100644 index 000000000..4b0f38d83 --- /dev/null +++ b/vpx_dsp/x86/blend_a64_vmask_sse4.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/blend.h" + +#include "vpx_dsp/x86/synonyms.h" +#include "vpx_dsp/x86/blend_sse4.h" + +#include "./vpx_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_vmask_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 16) { + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, + v_m0_w, v_m1_w); + const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, + v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + 
src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void vpx_blend_a64_vmask_sse4_1( + uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); + + // Dimension: width_index + static const blend_fn blend[9] = { + blend_a64_vmask_w16n_sse4_1, // w % 16 == 0 + vpx_blend_a64_vmask_c, // w == 1 + vpx_blend_a64_vmask_c, // w == 2 + NULL, // INVALID + blend_a64_vmask_w4_sse4_1, // w == 4 + NULL, // INVALID + NULL, // INVALID + NULL, // INVALID + blend_a64_vmask_w8_sse4_1, // w == 8 + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + blend[w & 0xf](dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + mask, h, w); +} + +#if CONFIG_VP9_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_vmask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, + blend_4_b10); +} + +static void blend_a64_vmask_b12_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, + blend_4_b12); +} + +static inline void blend_a64_vmask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 8) { + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void 
blend_a64_vmask_b10_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, w, + blend_8_b10); +} + +static void blend_a64_vmask_b12_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void vpx_highbd_blend_a64_vmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); + + // Dimensions are: bd_index X width_index + static const blend_fn blend[2][2] = { + { // bd == 8 or 10 + blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b10_w4_sse4_1, // w == 4 + }, { // bd == 12 + blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b12_w4_sse4_1, // w == 4 + } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + vpx_highbd_blend_a64_vmask_c(dst_8, dst_stride, + src0_8, src0_stride, + src1_8, src1_stride, + mask, h, w, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1](dst, dst_stride, + src0, src0_stride, + src1, src1_stride, + mask, h, w); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/blend_sse4.h b/vpx_dsp/x86/blend_sse4.h new file mode 100644 index 000000000..9b74f9054 --- /dev/null +++ b/vpx_dsp/x86/blend_sse4.h @@ -0,0 +1,145 @@ +/* +* Copyright (c) 2016 The WebM project authors. All Rights Reserved. +* +* Use of this source code is governed by a BSD-style license +* that can be found in the LICENSE file in the root of the source +* tree. An additional intellectual property rights grant can be found +* in the file PATENTS. All contributing project authors may +* be found in the AUTHORS file in the root of the source tree. 
+*/ + +#ifndef VPX_DSP_X86_BLEND_SSE4_H_ +#define VPX_DSP_X86_BLEND_SSE4_H_ + +#include "vpx_dsp/blend.h" +#include "vpx_dsp/x86/synonyms.h" + +////////////////////////////////////////////////////////////////////////////// +// Common kernels +////////////////////////////////////////////////////////////////////////////// + +static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +#if CONFIG_VP9_HIGHBITDEPTH +typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w); + +static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + // Interleave + const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); + + // Scale + const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, + VPX_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const 
__m128i v_s1_w = xx_loadu_128(src1); + + // Interleave + const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); + const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); + const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); + + // Scale + const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, + VPX_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, + VPX_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_DSP_X86_BLEND_SSE4_H_ -- 2.49.0
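As a usage note on the dispatch tables above: block widths are powers of two, so (w >> 2) & 3 maps w == 4 to index 1, w == 8 to index 2, and any multiple of 16 to index 0, while (h | w) & 3 is non-zero exactly when either dimension is 1 or 2, which is the case now routed to the C fallback. A small standalone sketch of that mapping (not part of the patch):

#include <stdio.h>

/* Illustrative check of the width indexing used by vpx_blend_a64_mask_sse4_1:
 * for power-of-two w >= 4, (w >> 2) & 3 selects 0 for w % 16 == 0,
 * 1 for w == 4 and 2 for w == 8; (h | w) & 3 flags the w/h <= 2 fallback. */
int main(void) {
  int w;
  for (w = 1; w <= 128; w <<= 1) {
    const int needs_c_fallback = (w & 3) != 0;  /* true for w == 1 or w == 2 */
    const int width_index = (w >> 2) & 3;       /* only used when no fallback */
    printf("w=%3d fallback=%d index=%d\n", w, needs_c_fallback,
           needs_c_fallback ? -1 : width_index);
  }
  return 0;
}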