- Made source buffer pointers point to const.
- Renamed vpx_blend_mask6b to vpx_blend_a64_mask. This name better
  indicates that the function does alpha blending. The 6/6b suffix
  was misleading, as the maximum mask value (64) does not fit into
  6 bits.
- Added VPX_BLEND_* macros for blending scalar values (see the usage
  sketch after this list).
- Used VPX_BLEND_A256 in combine_interintra to be more explicit about
  the operation being performed.
- Added versions of vpx_blend_a64_* which take 1D horizontal/vertical
  masks directly and apply them to all rows/columns
  (vpx_blend_a64_hmask and vpx_blend_a64_vmask). The SSE4.1 optimized
  horizontal version currently falls back on the 2D version. This can
  be improved upon if it shows up high enough in a profile.
- All vpx_blend_a64_* functions now support block sizes down to 1x1
  (i.e. a single pixel), for usage convenience. The SSE4.1 optimized
  versions fall back on the C implementation if w <= 2 or h <= 2. This
  can again be improved if it becomes hot code.
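
The following is an illustrative sketch (not part of this change) of
how the new scalar blend macros behave; the include path and the
sample values are assumed for demonstration only:

  #include <stdio.h>
  #include "vpx_dsp/blend.h"

  int main(void) {
    // Alpha in [0, 64]: 64 selects the first input, 0 selects the second.
    printf("%d\n", VPX_BLEND_A64(64, 200, 10));    // 200
    printf("%d\n", VPX_BLEND_A64(32, 200, 10));    // 105
    // Alpha in [0, 256], as now used by combine_interintra.
    printf("%d\n", VPX_BLEND_A256(128, 200, 10));  // 105
    // Rounding average of the two inputs.
    printf("%d\n", VPX_BLEND_AVG(200, 11));        // 106
    return 0;
  }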
Change-Id: I13ab3835146ffafe3e1d74d8e9cf64a5abe4144d
--- /dev/null
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "test/acm_random.h"
+#include "vp10/common/enums.h"
+
+#include "vpx_dsp/blend.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::FunctionEquivalenceTest;
+using std::tr1::make_tuple;
+
+namespace {
+
+template<typename F, typename T>
+class BlendA64Mask1DTest : public FunctionEquivalenceTest<F> {
+ public:
+ static const int kIterations = 10000;
+ static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides
+ static const int kMaxHeight = MAX_SB_SIZE;
+ static const int kBufSize = kMaxWidth * kMaxHeight;
+ static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+ static const int kMaxMaskSize = kMaxMaskWidth;
+
+ BlendA64Mask1DTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+ virtual ~BlendA64Mask1DTest() {}
+
+ virtual void Execute(T *p_src0, T *p_src1) = 0;
+
+ void Common() {
+ w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+ h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+
+ dst_offset_ = rng_(33);
+ dst_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+ src0_offset_ = rng_(33);
+ src0_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+ src1_offset_ = rng_(33);
+ src1_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+ T *p_src0;
+ T *p_src1;
+
+ switch (rng_(3)) {
+ case 0: // Separate sources
+ p_src0 = src0_;
+ p_src1 = src1_;
+ break;
+ case 1: // src0 == dst
+ p_src0 = dst_tst_;
+ src0_stride_ = dst_stride_;
+ src0_offset_ = dst_offset_;
+ p_src1 = src1_;
+ break;
+ case 2: // src1 == dst
+ p_src0 = src0_;
+ p_src1 = dst_tst_;
+ src1_stride_ = dst_stride_;
+ src1_offset_ = dst_offset_;
+ break;
+ default:
+ FAIL();
+ }
+
+ Execute(p_src0, p_src1);
+
+ for (int r = 0 ; r < h_ ; ++r) {
+ for (int c = 0 ; c < w_ ; ++c) {
+ ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+ dst_tst_[dst_offset_ + r * dst_stride_ + c]);
+ }
+ }
+ }
+
+ ACMRandom rng_;
+
+ T dst_ref_[kBufSize];
+ T dst_tst_[kBufSize];
+ size_t dst_stride_;
+ size_t dst_offset_;
+
+ T src0_[kBufSize];
+ size_t src0_stride_;
+ size_t src0_offset_;
+
+ T src1_[kBufSize];
+ size_t src1_stride_;
+ size_t src1_offset_;
+
+ uint8_t mask_[kMaxMaskSize];
+
+ int w_;
+ int h_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w);
+
+class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
+ protected:
+ void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+ ref_func_(dst_ref_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_,
+ mask_, h_, w_);
+
+ tst_func_(dst_tst_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_,
+ mask_, h_, w_);
+ }
+};
+
+TEST_P(BlendA64Mask1DTest8B, RandomValues) {
+ for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0 ; i < kBufSize ; ++i) {
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
+
+ src0_[i] = rng_.Rand8();
+ src1_[i] = rng_.Rand8();
+ }
+
+ for (int i = 0 ; i < kMaxMaskSize ; ++i)
+ mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+ Common();
+ }
+}
+
+TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
+ for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0 ; i < kBufSize ; ++i) {
+ dst_ref_[i] = rng_(2) + 254;
+ dst_tst_[i] = rng_(2) + 254;
+ src0_[i] = rng_(2) + 254;
+ src1_[i] = rng_(2) + 254;
+ }
+
+ for (int i = 0 ; i < kMaxMaskSize ; ++i)
+ mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+ Common();
+ }
+}
+
+static void blend_a64_hmask_ref(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+ [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+ for (int row = 0 ; row < h ; ++row)
+ for (int col = 0 ; col < w ; ++col)
+ mask2d[row][col] = mask[col];
+
+ vpx_blend_a64_mask_c(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize,
+ h, w, 0, 0);
+}
+
+static void blend_a64_vmask_ref(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+ [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+ for (int row = 0 ; row < h ; ++row)
+ for (int col = 0 ; col < w ; ++col)
+ mask2d[row][col] = mask[row];
+
+ vpx_blend_a64_mask_c(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize,
+ h, w, 0, 0);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, BlendA64Mask1DTest8B,
+ ::testing::Values(
+ make_tuple(blend_a64_hmask_ref, vpx_blend_a64_hmask_c),
+ make_tuple(blend_a64_vmask_ref, vpx_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, BlendA64Mask1DTest8B,
+ ::testing::Values(
+ make_tuple(blend_a64_hmask_ref, vpx_blend_a64_hmask_sse4_1),
+ make_tuple(blend_a64_vmask_ref, vpx_blend_a64_vmask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd);
+
+class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
+ protected:
+ void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+ ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+ mask_, h_, w_, bit_depth_);
+
+ ASM_REGISTER_STATE_CHECK(
+ tst_func_(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+ mask_, h_, w_, bit_depth_));
+ }
+
+ int bit_depth_;
+};
+
+TEST_P(BlendA64Mask1DTestHBD, RandomValues) {
+ for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {
+ case 0:
+ bit_depth_ = 8;
+ break;
+ case 1:
+ bit_depth_ = 10;
+ break;
+ default:
+ bit_depth_ = 12;
+ break;
+ }
+
+ const int hi = 1 << bit_depth_;
+
+ for (int i = 0 ; i < kBufSize ; ++i) {
+ dst_ref_[i] = rng_(hi);
+ dst_tst_[i] = rng_(hi);
+ src0_[i] = rng_(hi);
+ src1_[i] = rng_(hi);
+ }
+
+ for (int i = 0 ; i < kMaxMaskSize ; ++i)
+ mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+ Common();
+ }
+}
+
+TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
+ for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {
+ case 0:
+ bit_depth_ = 8;
+ break;
+ case 1:
+ bit_depth_ = 10;
+ break;
+ default:
+ bit_depth_ = 12;
+ break;
+ }
+
+ const int hi = 1 << bit_depth_;
+ const int lo = hi - 2;
+
+ for (int i = 0 ; i < kBufSize ; ++i) {
+ dst_ref_[i] = rng_(hi - lo) + lo;
+ dst_tst_[i] = rng_(hi - lo) + lo;
+ src0_[i] = rng_(hi - lo) + lo;
+ src1_[i] = rng_(hi - lo) + lo;
+ }
+
+ for (int i = 0 ; i < kMaxMaskSize ; ++i)
+ mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+ Common();
+ }
+}
+
+static void highbd_blend_a64_hmask_ref(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+ [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+ for (int row = 0 ; row < h ; ++row)
+ for (int col = 0 ; col < w ; ++col)
+ mask2d[row][col] = mask[col];
+
+ vpx_highbd_blend_a64_mask_c(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ &mask2d[0][0],
+ BlendA64Mask1DTestHBD::kMaxMaskSize,
+ h, w, 0, 0, bd);
+}
+
+static void highbd_blend_a64_vmask_ref(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+ [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+ for (int row = 0 ; row < h ; ++row)
+ for (int col = 0 ; col < w ; ++col)
+ mask2d[row][col] = mask[row];
+
+ vpx_highbd_blend_a64_mask_c(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ &mask2d[0][0],
+ BlendA64Mask1DTestHBD::kMaxMaskSize,
+ h, w, 0, 0, bd);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, BlendA64Mask1DTestHBD,
+ ::testing::Values(
+ make_tuple(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_c),
+ make_tuple(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, BlendA64Mask1DTestHBD,
+ ::testing::Values(
+ make_tuple(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_sse4_1),
+ make_tuple(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+} // namespace
#include "test/acm_random.h"
#include "vp10/common/enums.h"
+#include "vpx_dsp/blend.h"
+
using libvpx_test::ACMRandom;
using libvpx_test::FunctionEquivalenceTest;
using std::tr1::make_tuple;
namespace {
template<typename F, typename T>
-class BlendMask6Test : public FunctionEquivalenceTest<F> {
+class BlendA64MaskTest : public FunctionEquivalenceTest<F> {
protected:
static const int kIterations = 10000;
static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides
static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth;
- BlendMask6Test() : rng_(ACMRandom::DeterministicSeed()) {}
+ BlendA64MaskTest() : rng_(ACMRandom::DeterministicSeed()) {}
- virtual ~BlendMask6Test() {}
+ virtual ~BlendA64MaskTest() {}
- virtual void Execute(T *p_src0, T *p_src1) = 0;
+ virtual void Execute(const T *p_src0, const T *p_src1) = 0;
void Common() {
- w_ = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 2) + 2);
- h_ = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 2) + 2);
+ w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+ h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
subx_ = rng_(2);
suby_ = rng_(2);
//////////////////////////////////////////////////////////////////////////////
typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int suby, int subx);
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx);
-class BlendMask6Test8B : public BlendMask6Test<F8B, uint8_t> {
+class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t> {
protected:
- void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+ void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
ref_func_(dst_ref_ + dst_offset_, dst_stride_,
p_src0 + src0_offset_, src0_stride_,
p_src1 + src1_offset_, src1_stride_,
}
};
-TEST_P(BlendMask6Test8B, RandomValues) {
+TEST_P(BlendA64MaskTest8B, RandomValues) {
for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
for (int i = 0 ; i < kBufSize ; ++i) {
dst_ref_[i] = rng_.Rand8();
}
for (int i = 0 ; i < kMaxMaskSize ; ++i)
- mask_[i] = rng_(65);
+ mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
Common();
}
}
-TEST_P(BlendMask6Test8B, ExtremeValues) {
+TEST_P(BlendA64MaskTest8B, ExtremeValues) {
for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
for (int i = 0 ; i < kBufSize ; ++i) {
dst_ref_[i] = rng_(2) + 254;
}
for (int i = 0 ; i < kMaxMaskSize ; ++i)
- mask_[i] = rng_(2) + 63;
+ mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
Common();
}
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(
- SSE4_1_C_COMPARE, BlendMask6Test8B,
- ::testing::Values(make_tuple(&vpx_blend_mask6b_c, &vpx_blend_mask6b_sse4_1)));
+ SSE4_1_C_COMPARE, BlendA64MaskTest8B,
+ ::testing::Values(make_tuple(vpx_blend_a64_mask_c,
+ vpx_blend_a64_mask_sse4_1)));
#endif // HAVE_SSE4_1
#if CONFIG_VP9_HIGHBITDEPTH
//////////////////////////////////////////////////////////////////////////////
typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int suby, int subx, int bd);
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx, int bd);
-class BlendMask6TestHBD : public BlendMask6Test<FHBD, uint16_t> {
+class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t> {
protected:
- void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
int bit_depth_;
};
-TEST_P(BlendMask6TestHBD, RandomValues) {
+TEST_P(BlendA64MaskTestHBD, RandomValues) {
for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
switch (rng_(3)) {
case 0:
}
for (int i = 0 ; i < kMaxMaskSize ; ++i)
- mask_[i] = rng_(65);
+ mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
Common();
}
}
-TEST_P(BlendMask6TestHBD, ExtremeValues) {
+TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
switch (rng_(3)) {
case 0:
}
for (int i = 0 ; i < kMaxMaskSize ; ++i)
- mask_[i] = rng_(65);
+ mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
Common();
}
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(
- SSE4_1_C_COMPARE, BlendMask6TestHBD,
- ::testing::Values(make_tuple(&vpx_highbd_blend_mask6b_c,
- &vpx_highbd_blend_mask6b_sse4_1)));
+ SSE4_1_C_COMPARE, BlendA64MaskTestHBD,
+ ::testing::Values(make_tuple(vpx_highbd_blend_a64_mask_c,
+ vpx_highbd_blend_a64_mask_sse4_1)));
#endif // HAVE_SSE4_1
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_1d_test.cc
ifeq ($(CONFIG_EXT_INTER),yes)
LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6b_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_wedge_utils_test.cc
endif
p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
}
- vpx_blend_mask6b(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
+ vpx_blend_a64_mask(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
vpx_subtract_block(h, w, r0, w, s, w, p0, w);
vpx_subtract_block(h, w, r1, w, s, w, p1, w);
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/blend.h"
#include "vp10/common/blockd.h"
#include "vp10/common/reconinter.h"
#if CONFIG_SUPERTX
static void build_masked_compound_wedge_extend(
uint8_t *dst, int dst_stride,
- uint8_t *src0, int src0_stride,
- uint8_t *src1, int src1_stride,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
int wedge_index,
int wedge_sign,
BLOCK_SIZE sb_type,
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(
wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
- vpx_blend_mask6b(dst, dst_stride,
- src0, src0_stride,
- src1, src1_stride,
- mask, MASK_MASTER_STRIDE,
- h, w, subh, subw);
+ vpx_blend_a64_mask(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw);
}
#if CONFIG_VP9_HIGHBITDEPTH
static void build_masked_compound_wedge_extend_highbd(
uint8_t *dst_8, int dst_stride,
- uint8_t *src0_8, int src0_stride,
- uint8_t *src1_8, int src1_stride,
+ const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
int wedge_index, int wedge_sign,
BLOCK_SIZE sb_type,
int wedge_offset_x, int wedge_offset_y,
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(
wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
- vpx_highbd_blend_mask6b(dst_8, dst_stride,
- src0_8, src0_stride,
- src1_8, src1_stride,
- mask, MASK_MASTER_STRIDE,
- h, w, subh, subw, bd);
+ vpx_highbd_blend_a64_mask(dst_8, dst_stride,
+ src0_8, src0_stride,
+ src1_8, src1_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_SUPERTX
-static void build_masked_compound_wedge(uint8_t *dst, int dst_stride,
- uint8_t *src0, int src0_stride,
- uint8_t *src1, int src1_stride,
- int wedge_index, int wedge_sign,
- BLOCK_SIZE sb_type,
- int h, int w) {
+static void build_masked_compound_wedge(
+ uint8_t *dst, int dst_stride,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type,
+ int h, int w) {
// Derive subsampling from h and w passed in. May be refactored to
// pass in subsampling factors directly.
const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
sb_type);
- vpx_blend_mask6b(dst, dst_stride,
- src0, src0_stride,
- src1, src1_stride,
- mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
- h, w, subh, subw);
+ vpx_blend_a64_mask(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+ h, w, subh, subw);
}
#if CONFIG_VP9_HIGHBITDEPTH
-static void build_masked_compound_wedge_highbd(uint8_t *dst_8, int dst_stride,
- uint8_t *src0_8, int src0_stride,
- uint8_t *src1_8, int src1_stride,
- int wedge_index, int wedge_sign,
- BLOCK_SIZE sb_type,
- int h, int w, int bd) {
+static void build_masked_compound_wedge_highbd(
+ uint8_t *dst_8, int dst_stride,
+ const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
+ int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type,
+ int h, int w, int bd) {
// Derive subsampling from h and w passed in. May be refactored to
// pass in subsampling factors directly.
const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
sb_type);
- vpx_highbd_blend_mask6b(dst_8, dst_stride,
- src0_8, src0_stride,
- src1_8, src1_stride,
- mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
- h, w, subh, subw, bd);
+ vpx_highbd_blend_a64_mask(dst_8, dst_stride,
+ src0_8, src0_stride,
+ src1_8, src1_stride,
+ mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+ h, w, subh, subw, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
BLOCK_SIZE plane_bsize,
uint8_t *comppred,
int compstride,
- uint8_t *interpred,
+ const uint8_t *interpred,
int interstride,
- uint8_t *intrapred,
+ const uint8_t *intrapred,
int intrastride) {
- const int scale_bits = 8;
- const int scale_max = (1 << scale_bits);
const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
const int size_scale = ii_size_scales[plane_bsize];
bsize);
const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
- vpx_blend_mask6b(comppred, compstride,
- intrapred, intrastride,
- interpred, interstride,
- mask, 4 * num_4x4_blocks_wide_lookup[bsize],
- bh, bw, subh, subw);
+ vpx_blend_a64_mask(comppred, compstride,
+ intrapred, intrastride,
+ interpred, interstride,
+ mask, 4 * num_4x4_blocks_wide_lookup[bsize],
+ bh, bw, subh, subw);
}
return;
}
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[i * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[j * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
int scale = (ii_weights1d[i * size_scale] * 3 +
ii_weights1d[j * size_scale]) >> 2;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
int scale = (ii_weights1d[j * size_scale] * 3 +
ii_weights1d[i * size_scale]) >> 2;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[(i < j ? i : j) * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
int scale = (ii_weights1d[i * size_scale] +
ii_weights1d[j * size_scale]) >> 1;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- interpred[i * interstride + j] +
- intrapred[i * intrastride + j],
- 1);
+ VPX_BLEND_AVG(intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
BLOCK_SIZE plane_bsize,
uint8_t *comppred8,
int compstride,
- uint8_t *interpred8,
+ const uint8_t *interpred8,
int interstride,
- uint8_t *intrapred8,
+ const uint8_t *intrapred8,
int intrastride, int bd) {
- const int scale_bits = 8;
- const int scale_max = (1 << scale_bits);
const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
const int size_scale = ii_size_scales[plane_bsize];
int i, j;
uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
- uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
- uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
+ const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
+ const uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
if (use_wedge_interintra) {
if (is_interintra_wedge_used(bsize)) {
bsize);
const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
- vpx_highbd_blend_mask6b(comppred8, compstride,
- intrapred8, intrastride,
- interpred8, interstride,
- mask, bw,
- bh, bw, subh, subw, bd);
+ vpx_highbd_blend_a64_mask(comppred8, compstride,
+ intrapred8, intrastride,
+ interpred8, interstride,
+ mask, bw,
+ bh, bw, subh, subw, bd);
}
return;
}
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[i * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[j * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
int scale = (ii_weights1d[i * size_scale] * 3 +
ii_weights1d[j * size_scale]) >> 2;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
int scale = (ii_weights1d[j * size_scale] * 3 +
ii_weights1d[i * size_scale]) >> 2;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[(i < j ? i : j) * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
int scale = (ii_weights1d[i * size_scale] +
ii_weights1d[j * size_scale]) >> 1;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- interpred[i * interstride + j] +
- intrapred[i * intrastride + j],
- 1);
+ VPX_BLEND_AVG(interpred[i * interstride + j],
+ intrapred[i * intrastride + j]);
}
}
break;
void vp10_combine_interintra(MACROBLOCKD *xd,
BLOCK_SIZE bsize, int plane,
- uint8_t *inter_pred, int inter_stride,
- uint8_t *intra_pred, int intra_stride) {
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
void vp10_combine_interintra(
MACROBLOCKD *xd,
BLOCK_SIZE bsize, int plane,
- uint8_t *inter_pred, int inter_stride,
- uint8_t *intra_pred, int intra_stride);
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride);
void vp10_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
uint8_t *upred,
uint8_t *vpred,
--- /dev/null
+/*
+* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+*
+* Use of this source code is governed by a BSD-style license
+* that can be found in the LICENSE file in the root of the source
+* tree. An additional intellectual property rights grant can be found
+* in the file PATENTS. All contributing project authors may
+* be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VPX_DSP_BLEND_H_
+#define VPX_DSP_BLEND_H_
+
+#include "vpx_ports/mem.h"
+
+// Various blending functions and macros.
+// See also the vpx_blend_* functions in vpx_dsp_rtcd.h
+
+// Alpha blending with alpha values from the range [0, 64], where 64
+// means use the first input and 0 means use the second input.
+#define VPX_BLEND_A64_ROUND_BITS 6
+#define VPX_BLEND_A64_MAX_ALPHA (1 << VPX_BLEND_A64_ROUND_BITS) // 64
+
+#define VPX_BLEND_A64(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
+ VPX_BLEND_A64_ROUND_BITS)
+
+// Alpha blending with alpha values from the range [0, 256], where 256
+// means use the first input and 0 means use the second input.
+#define VPX_BLEND_A256_ROUND_BITS 8
+#define VPX_BLEND_A256_MAX_ALPHA (1 << VPX_BLEND_A256_ROUND_BITS) // 256
+
+#define VPX_BLEND_A256(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
+ VPX_BLEND_A256_ROUND_BITS)
+
+// Blending by averaging.
+#define VPX_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
+
+#endif // VPX_DSP_BLEND_H_
--- /dev/null
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_blend_a64_hmask_c(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j],
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_hmask_c(
+ uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j],
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
--- /dev/null
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/blend.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for VPX_BLEND_A64 in vpx_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int subh, int subw) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = mask[i * mask_stride + j];
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m =
+ ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int subh, int subw, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = mask[i * mask_stride + j];
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m =
+ ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
--- /dev/null
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_blend_a64_vmask_c(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ for (i = 0; i < h; ++i) {
+ const int m = mask[i];
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_vmask_c(
+ uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ for (i = 0; i < h; ++i) {
+ const int m = mask[i];
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+++ /dev/null
-/*
-* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
-*
-* Use of this source code is governed by a BSD-style license
-* that can be found in the LICENSE file in the root of the source
-* tree. An additional intellectual property rights grant can be found
-* in the file PATENTS. All contributing project authors may
-* be found in the AUTHORS file in the root of the source tree.
-*/
-
-#ifndef VPX_DSP_BLEND_MASK_H_
-#define VPX_DSP_BLEND_MASK_H_
-
-// Use blend_mask6b() for 6 bit masks
-#define MASK_BITS6 6
-
-#endif // VPX_DSP_BLEND_MASK_H_
+++ /dev/null
-/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-#include "vpx_dsp/blend_mask.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-
-#include "./vpx_dsp_rtcd.h"
-
-void vpx_blend_mask6b_c(uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int subh, int subw) {
- int i, j;
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 4);
- assert(w >= 4);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 = mask[i * mask_stride + j];
- const int m1 = ((1 << MASK_BITS6) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS6);
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
- mask[(2 * i + 1) * mask_stride + (2 * j)] +
- mask[(2 * i) * mask_stride + (2 * j + 1)] +
- mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
- 2);
- const int m1 = ((1 << MASK_BITS6) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS6);
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
- mask[i * mask_stride + (2 * j + 1)], 1);
- const int m1 = ((1 << MASK_BITS6) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS6);
- }
- } else {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
- mask[(2 * i + 1) * mask_stride + j], 1);
- const int m1 = ((1 << MASK_BITS6) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS6);
- }
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_blend_mask6b_c(uint8_t *dst_8, uint32_t dst_stride,
- uint8_t *src0_8, uint32_t src0_stride,
- uint8_t *src1_8, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int subh, int subw, int bd) {
- int i, j;
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
- uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
- uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 4);
- assert(w >= 4);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- assert(bd == 8 || bd == 10 || bd == 12);
-
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 = mask[i * mask_stride + j];
- const int m1 = ((1 << MASK_BITS6) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS6);
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
- mask[(2 * i + 1) * mask_stride + (2 * j)] +
- mask[(2 * i) * mask_stride + (2 * j + 1)] +
- mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
- 2);
- const int m1 = ((1 << MASK_BITS6) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS6);
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
- mask[i * mask_stride + (2 * j + 1)], 1);
- const int m1 = ((1 << MASK_BITS6) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS6);
- }
- } else {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
- mask[(2 * i + 1) * mask_stride + j], 1);
- const int m1 = ((1 << MASK_BITS6) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS6);
- }
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
# inter predictions
ifeq ($(CONFIG_VP10),yes)
-ifeq ($(CONFIG_EXT_INTER),yes)
-DSP_SRCS-yes += blend_mask6b.c
-DSP_SRCS-yes += blend_mask.h
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_mask6b_sse4.c
-endif #CONFIG_EXT_INTER
+DSP_SRCS-yes += blend.h
+DSP_SRCS-yes += blend_a64_mask.c
+DSP_SRCS-yes += blend_a64_hmask.c
+DSP_SRCS-yes += blend_a64_vmask.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
endif #CONFIG_VP10
# interpolation filters
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+if (vpx_config("CONFIG_VP10") eq "yes") {
+ #
+ # Alpha blending with mask
+ #
+ add_proto qw/void vpx_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+ add_proto qw/void vpx_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+ add_proto qw/void vpx_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+ specialize "vpx_blend_a64_mask", qw/sse4_1/;
+ specialize "vpx_blend_a64_hmask", qw/sse4_1/;
+ specialize "vpx_blend_a64_vmask", qw/sse4_1/;
+
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+ add_proto qw/void vpx_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+ add_proto qw/void vpx_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+ specialize "vpx_highbd_blend_a64_mask", qw/sse4_1/;
+ specialize "vpx_highbd_blend_a64_hmask", qw/sse4_1/;
+ specialize "vpx_highbd_blend_a64_vmask", qw/sse4_1/;
+ }
+} # CONFIG_VP10
+
if (vpx_config("CONFIG_ENCODERS") eq "yes") {
#
# Block subtraction
}
}
}
-
- add_proto qw/void vpx_blend_mask6b/, "uint8_t *dst, uint32_t dst_stride, uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
- specialize "vpx_blend_mask6b", qw/sse4_1/;
-
- if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void vpx_highbd_blend_mask6b/, "uint8_t *dst, uint32_t dst_stride, uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
- specialize "vpx_highbd_blend_mask6b", qw/sse4_1/;
- }
}
#
--- /dev/null
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx/vpx_integer.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// To start out, just dispatch to the function using the 2D mask and
+// pass mask stride as 0. This can be improved upon if necessary.
+
+void vpx_blend_a64_hmask_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ vpx_blend_a64_mask_sse4_1(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, 0, h, w, 0, 0);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_hmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w,
+ int bd) {
+ vpx_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride,
+ src0_8, src0_stride,
+ src1_8, src1_stride,
+ mask, 0, h, w, 0, 0, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/blend_mask.h"
+#include "vpx_dsp/blend.h"
#include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/x86/blend_sse4.h"
#include "./vpx_dsp_rtcd.h"
-//////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_b = xx_loadl_32(src0);
- const __m128i v_s1_b = xx_loadl_32(src1);
- const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
- const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS6);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_b = xx_loadl_64(src0);
- const __m128i v_s1_b = xx_loadl_64(src1);
- const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
- const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS6);
-
- return v_res_w;
-}
-
//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static void blend_mask6b_w4_sse4_1(
+static void blend_a64_mask_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
} while (--h);
}
-static void blend_mask6b_w8_sse4_1(
+static void blend_a64_mask_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
} while (--h);
}
-static void blend_mask6b_w16n_sse4_1(
+static void blend_a64_mask_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static void blend_mask6b_sx_w4_sse4_1(
+static void blend_a64_mask_sx_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
} while (--h);
}
-static void blend_mask6b_sx_w8_sse4_1(
+static void blend_a64_mask_sx_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
} while (--h);
}
-static void blend_mask6b_sx_w16n_sse4_1(
+static void blend_a64_mask_sx_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static void blend_mask6b_sy_w4_sse4_1(
+static void blend_a64_mask_sy_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
} while (--h);
}
-static void blend_mask6b_sy_w8_sse4_1(
+static void blend_a64_mask_sy_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
} while (--h);
}
-static void blend_mask6b_sy_w16n_sse4_1(
+static void blend_a64_mask_sy_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zero = _mm_setzero_si128();
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static void blend_mask6b_sx_sy_w4_sse4_1(
+static void blend_a64_mask_sx_sy_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
} while (--h);
}
-static void blend_mask6b_sx_sy_w8_sse4_1(
+static void blend_a64_mask_sx_sy_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
} while (--h);
}
-static void blend_mask6b_sx_sy_w16n_sse4_1(
+static void blend_a64_mask_sx_sy_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
// Dispatch
//////////////////////////////////////////////////////////////////////////////
-void vpx_blend_mask6b_sse4_1(uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int suby, int subx) {
+void vpx_blend_a64_mask_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx) {
typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w);
- static blend_fn blend[3][2][2] = { // width_index X subx X suby
+ // Dimensions are: width_index X subx X suby
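+ // For the power-of-two block widths handled here (w >= 4), the width
+ // index is computed as (w >> 2) & 3 below: 0 for w % 16 == 0, 1 for
+ // w == 4 and 2 for w == 8.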
+ static const blend_fn blend[3][2][2] = {
{ // w % 16 == 0
- {blend_mask6b_w16n_sse4_1, blend_mask6b_sy_w16n_sse4_1},
- {blend_mask6b_sx_w16n_sse4_1, blend_mask6b_sx_sy_w16n_sse4_1}
+ {blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1},
+ {blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1}
}, { // w == 4
- {blend_mask6b_w4_sse4_1, blend_mask6b_sy_w4_sse4_1},
- {blend_mask6b_sx_w4_sse4_1, blend_mask6b_sx_sy_w4_sse4_1}
+ {blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1},
+ {blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1}
}, { // w == 8
- {blend_mask6b_w8_sse4_1, blend_mask6b_sy_w8_sse4_1},
- {blend_mask6b_sx_w8_sse4_1, blend_mask6b_sx_sy_w8_sse4_1}
+ {blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1},
+ {blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1}
}
};
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
- assert(h >= 4);
- assert(w >= 4);
+ assert(h >= 1);
+ assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
- blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
- src0, src0_stride,
- src1, src1_stride,
- mask, mask_stride,
- h, w);
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ vpx_blend_a64_mask_c(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, mask_stride,
+ h, w, suby, subx);
+ } else {
+ blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, mask_stride,
+ h, w);
+ }
}
#if CONFIG_VP9_HIGHBITDEPTH
-//////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-typedef __m128i (*blend_unit_fn)(uint16_t*src0, uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w);
-
-static INLINE __m128i blend_4_b10(uint16_t*src0, uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadl_64(src0);
- const __m128i v_s1_w = xx_loadl_64(src1);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS6);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_8_b10(uint16_t*src0, uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadu_128(src0);
- const __m128i v_s1_w = xx_loadu_128(src1);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS6);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadl_64(src0);
- const __m128i v_s1_w = xx_loadl_64(src1);
-
- // Interleave
- const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
- const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-
- // Multiply-Add
- const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
-
- // Scale
- const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, MASK_BITS6 - 1);
-
- // Pack
- const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
-
- // Round
- const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadu_128(src0);
- const __m128i v_s1_w = xx_loadu_128(src1);
-
- // Interleave
- const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
- const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
- const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
- const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
-
- // Multiply-Add
- const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
- const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
-
- // Scale
- const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, MASK_BITS6 - 1);
- const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, MASK_BITS6 - 1);
-
- // Pack
- const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
-
- // Round
- const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
- return v_res_w;
-}
-
//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static INLINE void blend_mask6b_bn_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_m0_b = xx_loadl_32(mask);
} while (--h);
}
-static void blend_mask6b_b10_w4_sse4_1(
+static void blend_a64_mask_b10_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6b_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
}
-static void blend_mask6b_b12_w4_sse4_1(
+static void blend_a64_mask_b12_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6b_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
}
-static inline void blend_mask6b_bn_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
} while (--h);
}
-static void blend_mask6b_b10_w8n_sse4_1(
+static void blend_a64_mask_b10_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6b_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b10);
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
}
-static void blend_mask6b_b12_w8n_sse4_1(
+static void blend_a64_mask_b12_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6b_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b12);
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static INLINE void blend_mask6b_bn_sx_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_r_b = xx_loadl_64(mask);
} while (--h);
}
-static void blend_mask6b_b10_sx_w4_sse4_1(
+static void blend_a64_mask_b10_sx_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6b_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
}
-static void blend_mask6b_b12_sx_w4_sse4_1(
+static void blend_a64_mask_b12_sx_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6b_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
}
-static INLINE void blend_mask6b_bn_sx_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w, blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
} while (--h);
}
-static void blend_mask6b_b10_sx_w8n_sse4_1(
+static void blend_a64_mask_b10_sx_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6b_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b10);
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
}
-static void blend_mask6b_b12_sx_w8n_sse4_1(
+static void blend_a64_mask_b12_sx_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6b_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b12);
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static INLINE void blend_mask6b_bn_sy_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_ra_b = xx_loadl_32(mask);
} while (--h);
}
-static void blend_mask6b_b10_sy_w4_sse4_1(
+static void blend_a64_mask_b10_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6b_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
}
-static void blend_mask6b_b12_sy_w4_sse4_1(
+static void blend_a64_mask_b12_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6b_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
}
-static INLINE void blend_mask6b_bn_sy_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
} while (--h);
}
-static void blend_mask6b_b10_sy_w8n_sse4_1(
+static void blend_a64_mask_b10_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6b_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b10);
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
}
-static void blend_mask6b_b12_sy_w8n_sse4_1(
+static void blend_a64_mask_b12_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6b_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b12);
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static INLINE void blend_mask6b_bn_sx_sy_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_ra_b = xx_loadl_64(mask);
} while (--h);
}
-static void blend_mask6b_b10_sx_sy_w4_sse4_1(
+static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6b_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
}
-static void blend_mask6b_b12_sx_sy_w4_sse4_1(
+static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6b_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
}
-static INLINE void blend_mask6b_bn_sx_sy_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w, blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS6);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
} while (--h);
}
-static void blend_mask6b_b10_sx_sy_w8n_sse4_1(
+static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6b_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b10);
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
}
-static void blend_mask6b_b12_sx_sy_w8n_sse4_1(
+static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6b_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b12);
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////
-void vpx_highbd_blend_mask6b_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
- uint8_t *src0_8, uint32_t src0_stride,
- uint8_t *src1_8, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int suby, int subx, int bd) {
- uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
- uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
- uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
-
+void vpx_highbd_blend_a64_mask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx, int bd) {
typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w);
- static blend_fn blend[2][2][2][2] = { // bd_index X width_index X subx X suby
+ // Dimensions are: bd_index X width_index X subx X suby
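+ // bd_index is (bd == 12); the 8- and 10-bit paths share the same
+ // kernels. width_index is (w >> 2) & 1: 0 for w % 8 == 0, 1 for w == 4.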
+ static const blend_fn blend[2][2][2][2] = {
{ // bd == 8 or 10
{ // w % 8 == 0
- {blend_mask6b_b10_w8n_sse4_1, blend_mask6b_b10_sy_w8n_sse4_1},
- {blend_mask6b_b10_sx_w8n_sse4_1, blend_mask6b_b10_sx_sy_w8n_sse4_1}
+ {blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1},
+ {blend_a64_mask_b10_sx_w8n_sse4_1, blend_a64_mask_b10_sx_sy_w8n_sse4_1}
}, { // w == 4
- {blend_mask6b_b10_w4_sse4_1, blend_mask6b_b10_sy_w4_sse4_1},
- {blend_mask6b_b10_sx_w4_sse4_1, blend_mask6b_b10_sx_sy_w4_sse4_1}
+ {blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1},
+ {blend_a64_mask_b10_sx_w4_sse4_1, blend_a64_mask_b10_sx_sy_w4_sse4_1}
}
},
{ // bd == 12
{ // w % 8 == 0
- {blend_mask6b_b12_w8n_sse4_1, blend_mask6b_b12_sy_w8n_sse4_1},
- {blend_mask6b_b12_sx_w8n_sse4_1, blend_mask6b_b12_sx_sy_w8n_sse4_1}
+ {blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1},
+ {blend_a64_mask_b12_sx_w8n_sse4_1, blend_a64_mask_b12_sx_sy_w8n_sse4_1}
}, { // w == 4
- {blend_mask6b_b12_w4_sse4_1, blend_mask6b_b12_sy_w4_sse4_1},
- {blend_mask6b_b12_sx_w4_sse4_1, blend_mask6b_b12_sx_sy_w4_sse4_1}
+ {blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1},
+ {blend_a64_mask_b12_sx_w4_sse4_1, blend_a64_mask_b12_sx_sy_w4_sse4_1}
}
}
};
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
- assert(h >= 4);
- assert(w >= 4);
+ assert(h >= 1);
+ assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
assert(bd == 8 || bd == 10 || bd == 12);
-
- blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
- src0, src0_stride,
- src1, src1_stride,
- mask, mask_stride,
- h, w);
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ vpx_highbd_blend_a64_mask_c(dst_8, dst_stride,
+ src0_8, src0_stride,
+ src1_8, src1_stride,
+ mask, mask_stride,
+ h, w, suby, subx, bd);
+ } else {
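+ // Only the SIMD kernels work on uint16_t pixels, so the
+ // CONVERT_TO_SHORTPTR handles are unwrapped on this path; the C
+ // fallback above takes the original 8-bit handles.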
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, mask_stride,
+ h, w);
+ }
}
#endif // CONFIG_VP9_HIGHBITDEPTH
--- /dev/null
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/x86/blend_sse4.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
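+// Each kernel consumes one mask value per row (mask is advanced by 1 per
+// iteration) and broadcasts it across the whole row, blending src0 against
+// src1 with weights mask[row] and (VPX_BLEND_A64_MAX_ALPHA - mask[row]).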
+
+static void blend_a64_vmask_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+ v_m0_w, v_m1_w);
+ const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+ v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_blend_a64_vmask_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w);
+
+ // Dimension: width_index
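+ // w is a power of two, so w & 0xf is 0 for any w >= 16 and equal to w
+ // otherwise. Widths 1 and 2 fall back on the C implementation.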
+ static const blend_fn blend[9] = {
+ blend_a64_vmask_w16n_sse4_1, // w % 16 == 0
+ vpx_blend_a64_vmask_c, // w == 1
+ vpx_blend_a64_vmask_c, // w == 2
+ NULL, // INVALID
+ blend_a64_vmask_w4_sse4_1, // w == 4
+ NULL, // INVALID
+ NULL, // INVALID
+ NULL, // INVALID
+ blend_a64_vmask_w8_sse4_1, // w == 8
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ blend[w & 0xf](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, h, w);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_vmask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ (void)w;
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h,
+ blend_4_b10);
+}
+
+static void blend_a64_vmask_b12_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ (void)w;
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, w,
+ blend_8_b10);
+}
+
+static void blend_a64_vmask_b12_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_highbd_blend_a64_vmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w);
+
+ // Dimensions are: bd_index X width_index
+ static const blend_fn blend[2][2] = {
+ { // bd == 8 or 10
+ blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b10_w4_sse4_1, // w == 4
+ }, { // bd == 12
+ blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b12_w4_sse4_1, // w == 4
+ }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ vpx_highbd_blend_a64_vmask_c(dst_8, dst_stride,
+ src0_8, src0_stride,
+ src1_8, src1_stride,
+ mask, h, w, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ blend[bd == 12][(w >> 2) & 1](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, h, w);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
--- /dev/null
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_BLEND_SSE4_H_
+#define VPX_DSP_X86_BLEND_SSE4_H_
+
+#include "vpx_dsp/blend.h"
+#include "vpx_dsp/x86/synonyms.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
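+// Each kernel computes, per pixel, the alpha blend
+//   m * src0 + (VPX_BLEND_A64_MAX_ALPHA - m) * src1
+// divided by 2^VPX_BLEND_A64_ROUND_BITS with rounding to nearest, where the
+// caller supplies the mask vector v_m0_w and its complement v_m1_w.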
+
+static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w);
+
+static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
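+// With 12-bit sources a 16-bit multiply would overflow (4095 * 64 does not
+// fit in 16 bits), so the b12 kernels interleave source and mask words and
+// use _mm_madd_epi16 to accumulate in 32 bits, pre-scale by
+// VPX_BLEND_A64_ROUND_BITS - 1, pack back to 16 bits and finish the
+// rounding with xx_round_epu16.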
+static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ // Interleave
+ const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+ // Scale
+ const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d,
+ VPX_BLEND_A64_ROUND_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+ // Round
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ // Interleave
+ const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+ const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+ const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+ // Scale
+ const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d,
+ VPX_BLEND_A64_ROUND_BITS - 1);
+ const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d,
+ VPX_BLEND_A64_ROUND_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+ // Round
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_DSP_X86_BLEND_SSE4_H_