--- /dev/null
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/randomise.h"
+#include "test/snapshot.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "test/assertion_helpers.h"
+#include "vp10/common/enums.h"
+
+using libvpx_test::assertion_helpers::BuffersEqWithin;
+using libvpx_test::assertion_helpers::BuffersEqOutside;
+using libvpx_test::assertion_helpers::ArraysEq;
+using libvpx_test::FunctionEquivalenceTest;
+using libvpx_test::Snapshot;
+using libvpx_test::Randomise;
+using std::tr1::make_tuple;
+
+namespace {
+
+template<typename F, typename T>
+class BlendMask6Test : public FunctionEquivalenceTest<F> {
+ protected:
+ virtual ~BlendMask6Test() {}
+
+ virtual void Execute(T *p_src0, T *p_src1) = 0;
+
+ void Common() {
+ w = 1 << randomise.uniform<int>(2, MAX_SB_SIZE_LOG2 + 1);
+ h = 1 << randomise.uniform<int>(2, MAX_SB_SIZE_LOG2 + 1);
+
+ randomise(subx);
+ randomise(suby);
+
+ randomise(dst_offset, 0, 32);
+ randomise(dst_stride, w, MAX_SB_SIZE * 5 + 1);
+
+ randomise(src0_offset, 0, 32);
+ randomise(src0_stride, w, MAX_SB_SIZE * 5 + 1);
+
+ randomise(src1_offset, 0, 32);
+ randomise(src1_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(mask_stride, w * (subx ? 2 : 1), 2 * MAX_SB_SIZE + 1);
+
+ T *p_src0;
+ T *p_src1;
+
+ switch (randomise.uniform<int>(3)) {
+ case 0: // Separate sources
+ p_src0 = &src0[0][0];
+ p_src1 = &src1[0][0];
+ break;
+ case 1: // src0 == dst
+ p_src0 = &dst_tst[0][0];
+ src0_stride = dst_stride;
+ src0_offset = dst_offset;
+ p_src1 = &src1[0][0];
+ break;
+ case 2: // src1 == dst
+ p_src0 = &src0[0][0];
+ p_src1 = &dst_tst[0][0];
+ src1_stride = dst_stride;
+ src1_offset = dst_offset;
+ break;
+ default:
+ FAIL();
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ // Prepare
+ //////////////////////////////////////////////////////////////////////////
+
+ snapshot(dst_ref);
+ snapshot(dst_tst);
+
+ snapshot(src0);
+ snapshot(src1);
+
+ snapshot(mask);
+
+ //////////////////////////////////////////////////////////////////////////
+ // Execute
+ //////////////////////////////////////////////////////////////////////////
+
+ Execute(p_src0, p_src1);
+
+ //////////////////////////////////////////////////////////////////////////
+ // Check
+ //////////////////////////////////////////////////////////////////////////
+
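+    // The blended region of dst_tst must match dst_ref exactly, the sources
+    // and the mask must be left untouched, and neither destination may have
+    // been written outside the blended region.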
+ ASSERT_TRUE(BuffersEqWithin(dst_ref, dst_tst,
+ dst_stride, dst_stride,
+ dst_offset, dst_offset,
+ h, w));
+
+ ASSERT_TRUE(ArraysEq(snapshot.get(src0), src0));
+ ASSERT_TRUE(ArraysEq(snapshot.get(src1), src1));
+ ASSERT_TRUE(ArraysEq(snapshot.get(mask), mask));
+
+ ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_ref), dst_ref,
+ dst_stride,
+ dst_offset,
+ h, w));
+
+ ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_tst), dst_tst,
+ dst_stride,
+ dst_offset,
+ h, w));
+ }
+
+ Snapshot snapshot;
+ Randomise randomise;
+
+ T dst_ref[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+ T dst_tst[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+ size_t dst_stride;
+ size_t dst_offset;
+
+ T src0[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+ size_t src0_stride;
+ size_t src0_offset;
+
+ T src1[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+ size_t src1_stride;
+ size_t src1_offset;
+
+ uint8_t mask[2 * MAX_SB_SIZE][2 * MAX_SB_SIZE];
+ size_t mask_stride;
+
+ int w;
+ int h;
+
+ bool suby;
+ bool subx;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8-bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx);
+
+class BlendMask6Test8B : public BlendMask6Test<F8B, uint8_t> {
+ protected:
+ void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+ ref_func_(&dst_ref[0][dst_offset], dst_stride,
+ p_src0 + src0_offset, src0_stride,
+ p_src1 + src1_offset, src1_stride,
+ &mask[0][0], sizeof(mask[0]),
+ h, w, suby, subx);
+
+ ASM_REGISTER_STATE_CHECK(
+ tst_func_(&dst_tst[0][dst_offset], dst_stride,
+ p_src0 + src0_offset, src0_stride,
+ p_src1 + src1_offset, src1_stride,
+ &mask[0][0], sizeof(mask[0]),
+ h, w, suby, subx));
+ }
+};
+
+TEST_P(BlendMask6Test8B, RandomValues) {
+  for (int i = 0; i < 10000 && !HasFatalFailure(); i++) {
+ //////////////////////////////////////////////////////////////////////////
+ // Randomise
+ //////////////////////////////////////////////////////////////////////////
+
+ randomise(dst_ref);
+ randomise(dst_tst);
+
+ randomise(src0);
+ randomise(src1);
+
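+    // Mask values are limited to the legal blend-weight range [0, 64].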
+ randomise(mask, 65);
+
+ Common();
+ }
+}
+
+TEST_P(BlendMask6Test8B, ExtremeValues) {
+  for (int i = 0; i < 1000 && !HasFatalFailure(); i++) {
+ //////////////////////////////////////////////////////////////////////////
+ // Randomise
+ //////////////////////////////////////////////////////////////////////////
+
+ randomise(dst_ref, 254, 256);
+ randomise(dst_tst, 254, 256);
+
+ randomise(src0, 254, 256);
+ randomise(src1, 254, 256);
+
+ randomise(mask, 63, 65);
+
+ Common();
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1_C_COMPARE, BlendMask6Test8B,
+ ::testing::Values(make_tuple(&vpx_blend_mask6_c, &vpx_blend_mask6_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx, int bd);
+
+class BlendMask6TestHBD : public BlendMask6Test<FHBD, uint16_t> {
+ protected:
+ void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+ ref_func_(CONVERT_TO_BYTEPTR(&dst_ref[0][dst_offset]), dst_stride,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride,
+ &mask[0][0], sizeof(mask[0]),
+ h, w, suby, subx, bit_depth);
+
+ ASM_REGISTER_STATE_CHECK(
+ tst_func_(CONVERT_TO_BYTEPTR(&dst_tst[0][dst_offset]), dst_stride,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride,
+ &mask[0][0], sizeof(mask[0]),
+ h, w, suby, subx, bit_depth));
+ }
+
+ int bit_depth;
+};
+
+TEST_P(BlendMask6TestHBD, RandomValues) {
+  for (int i = 0; i < 10000 && !HasFatalFailure(); i++) {
+ //////////////////////////////////////////////////////////////////////////
+ // Randomise
+ //////////////////////////////////////////////////////////////////////////
+
+ bit_depth = randomise.choice(8, 10, 12);
+
+ const int hi = 1 << bit_depth;
+
+ randomise(dst_ref, hi);
+ randomise(dst_tst, hi);
+
+ randomise(src0, hi);
+ randomise(src1, hi);
+
+ randomise(mask, 65);
+
+ Common();
+ }
+}
+
+TEST_P(BlendMask6TestHBD, ExtremeValues) {
+  for (int i = 0; i < 1000 && !HasFatalFailure(); i++) {
+ //////////////////////////////////////////////////////////////////////////
+ // Randomise
+ //////////////////////////////////////////////////////////////////////////
+
+ bit_depth = randomise.choice(8, 10, 12);
+
+ const int hi = 1 << bit_depth;
+ const int lo = hi - 2;
+
+ randomise(dst_ref, lo, hi);
+ randomise(dst_tst, lo, hi);
+
+ randomise(src0, lo, hi);
+ randomise(src1, lo, hi);
+
+ randomise(mask, 63, 65);
+
+ Common();
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1_C_COMPARE, BlendMask6TestHBD,
+ ::testing::Values(make_tuple(&vpx_highbd_blend_mask6_c,
+ &vpx_highbd_blend_mask6_sse4_1)));
+#endif // HAVE_SSE4_1
+#endif // CONFIG_VP9_HIGHBITDEPTH
+} // namespace
#include <assert.h>
#include "./vpx_scale_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
return mask;
}
-static void build_masked_compound(uint8_t *dst, int dst_stride,
- uint8_t *dst1, int dst1_stride,
- uint8_t *dst2, int dst2_stride,
- const uint8_t *mask,
- int h, int w, int subh, int subw) {
- int i, j;
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = mask[i * MASK_MASTER_STRIDE + j];
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
-
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] +
- mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] +
- mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- } else {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void build_masked_compound_highbd(uint8_t *dst_8, int dst_stride,
- uint8_t *dst1_8, int dst1_stride,
- uint8_t *dst2_8, int dst2_stride,
- const uint8_t *mask,
- int h, int w, int subh, int subw) {
- int i, j;
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
- uint16_t *dst1 = CONVERT_TO_SHORTPTR(dst1_8);
- uint16_t *dst2 = CONVERT_TO_SHORTPTR(dst2_8);
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = mask[i * MASK_MASTER_STRIDE + j];
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] +
- mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] +
- mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- } else {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_SUPERTX
static void build_masked_compound_wedge_extend(
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(
wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
- build_masked_compound(dst, dst_stride,
- dst, dst_stride, dst2, dst2_stride, mask,
- h, w, subh, subw);
+ vpx_blend_mask6(dst, dst_stride,
+ dst, dst_stride,
+ dst2, dst2_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw);
}
#if CONFIG_VP9_HIGHBITDEPTH
int wedge_index, int wedge_sign,
BLOCK_SIZE sb_type,
int wedge_offset_x, int wedge_offset_y,
- int h, int w) {
+ int h, int w, int bd) {
const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(
wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
- build_masked_compound_highbd(dst_8, dst_stride,
- dst_8, dst_stride, dst2_8, dst2_stride, mask,
- h, w, subh, subw);
+ vpx_highbd_blend_mask6(dst_8, dst_stride,
+ dst_8, dst_stride,
+ dst2_8, dst2_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
sb_type, 0, 0);
- build_masked_compound(dst, dst_stride,
- dst, dst_stride, dst2, dst2_stride, mask,
- h, w, subh, subw);
+ vpx_blend_mask6(dst, dst_stride,
+ dst, dst_stride,
+ dst2, dst2_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw);
}
#if CONFIG_VP9_HIGHBITDEPTH
uint8_t *dst2_8, int dst2_stride,
int wedge_index, int wedge_sign,
BLOCK_SIZE sb_type,
- int h, int w) {
+ int h, int w, int bd) {
// Derive subsampling from h and w passed in. May be refactored to
// pass in subsampling factors directly.
const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
sb_type, 0, 0);
- build_masked_compound_highbd(dst_8, dst_stride,
- dst_8, dst_stride, dst2_8, dst2_stride, mask,
- h, w, subh, subw);
+ vpx_highbd_blend_mask6(dst_8, dst_stride,
+ dst_8, dst_stride,
+ dst2_8, dst2_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_SUPERTX
mi->mbmi.interinter_wedge_index,
mi->mbmi.interinter_wedge_sign,
mi->mbmi.sb_type,
- wedge_offset_x, wedge_offset_y, h, w);
+ wedge_offset_x, wedge_offset_y, h, w, xd->cur_buf->bit_depth);
else
build_masked_compound_wedge_extend(
dst, dst_stride, tmp_dst, MAX_SB_SIZE,
dst, dst_stride, tmp_dst, MAX_SB_SIZE,
mi->mbmi.interinter_wedge_index,
mi->mbmi.interinter_wedge_sign,
- mi->mbmi.sb_type, h, w);
+ mi->mbmi.sb_type, h, w, xd->cur_buf->bit_depth);
else
build_masked_compound_wedge(
dst, dst_stride, tmp_dst, MAX_SB_SIZE,
bsize, 0, 0);
const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
- build_masked_compound(comppred, compstride,
- intrapred, intrastride,
- interpred, interstride, mask,
- bh, bw, subh, subw);
+ vpx_blend_mask6(comppred, compstride,
+ intrapred, intrastride,
+ interpred, interstride,
+ mask, MASK_MASTER_STRIDE,
+ bh, bw, subh, subw);
}
return;
}
uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
- (void) bd;
if (use_wedge_interintra) {
if (is_interintra_wedge_used(bsize)) {
bsize, 0, 0);
const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
- build_masked_compound_highbd(comppred8, compstride,
- intrapred8, intrastride,
- interpred8, interstride, mask,
- bh, bw, subh, subw);
+ vpx_highbd_blend_mask6(comppred8, compstride,
+ intrapred8, intrastride,
+ interpred8, interstride,
+ mask, MASK_MASTER_STRIDE,
+ bh, bw, subh, subw, bd);
}
return;
}
mi->mbmi.interinter_wedge_index,
mi->mbmi.interinter_wedge_sign,
mi->mbmi.sb_type,
- wedge_offset_x, wedge_offset_y, h, w);
+ wedge_offset_x, wedge_offset_y, h, w, xd->cur_buf->bit_depth);
} else {
build_masked_compound_wedge_extend(
dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
MAX_SB_SIZE,
mi->mbmi.interinter_wedge_index,
mi->mbmi.interinter_wedge_sign,
- mi->mbmi.sb_type, h, w);
+ mi->mbmi.sb_type, h, w,
+ xd->cur_buf->bit_depth);
else
#endif // CONFIG_VP9_HIGHBITDEPTH
build_masked_compound_wedge(dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
--- /dev/null
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+#define MASK_BITS 6
+
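+// Each output pixel is the 6-bit weighted average
+//   ROUND_POWER_OF_TWO(src0 * m + src1 * ((1 << MASK_BITS) - m), MASK_BITS)
+// i.e. the same computation as the C reference vpx_blend_mask6_c.
+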
+static INLINE __m128i mm_loadl_32(const void *a) {
+ return _mm_cvtsi32_si128(*(const uint32_t*)a);
+}
+
+static INLINE __m128i mm_loadl_64(const void *a) {
+ return _mm_loadl_epi64((const __m128i*)a);
+}
+
+static INLINE __m128i mm_loadu_128(const void *a) {
+ return _mm_loadu_si128((const __m128i*)a);
+}
+
+static INLINE void mm_storel_32(void *const a, const __m128i v) {
+ *(uint32_t*)a = _mm_cvtsi128_si32(v);
+}
+
+static INLINE void mm_storel_64(void *const a, const __m128i v) {
+ _mm_storel_epi64((__m128i*)a, v);
+}
+
+static INLINE void mm_storeu_128(void *const a, const __m128i v) {
+ _mm_storeu_si128((__m128i*)a, v);
+}
+
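+// _mm_avg_epu16(v, 0) computes (v + 1) >> 1, so these helpers implement
+// ROUND_POWER_OF_TWO(v, 1) and ROUND_POWER_OF_TWO(v, bits) without needing
+// a separate rounding constant.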
+static INLINE __m128i mm_round_epu16(__m128i v_val_w) {
+ return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i mm_roundn_epu16(__m128i v_val_w, int bits) {
+  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+ return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE __m128i blend_4(uint8_t *src0, uint8_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_b = mm_loadl_32(src0);
+ const __m128i v_s1_b = mm_loadl_32(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8(uint8_t *src0, uint8_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_b = mm_loadl_64(src0);
+ const __m128i v_s1_b = mm_loadl_64(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+ return v_res_w;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_mask6_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_b = mm_loadl_32(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_b = mm_loadl_64(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_m0l_b = mm_loadl_64(mask + c);
+ const __m128i v_m0h_b = mm_loadl_64(mask + c + 8);
+ const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
+ const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+ v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+ v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ mm_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_mask6_sx_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
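+  // Horizontal 2:1 sub-sampling: average each pair of adjacent mask bytes,
+  // then keep only the even lanes to obtain 16-bit weights.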
+ do {
+ const __m128i v_r_b = mm_loadl_64(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sx_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_r_b = mm_loadu_128(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sx_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_rl_b = mm_loadu_128(mask + 2 * c);
+ const __m128i v_rh_b = mm_loadu_128(mask + 2 * c + 16);
+ const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
+ const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
+
+ const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
+ const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+ v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+ v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ mm_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_mask6_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
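+  // Vertical 2:1 sub-sampling: average the two mask rows covering this
+  // destination row.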
+ do {
+ const __m128i v_ra_b = mm_loadl_32(mask);
+ const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = mm_loadl_64(mask);
+ const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zero = _mm_setzero_si128();
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ra_b = mm_loadu_128(mask + c);
+ const __m128i v_rb_b = mm_loadu_128(mask + c + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+ v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+ v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ mm_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_mask6_sx_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
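+  // 2:1 sub-sampling in both directions: sum the two rows byte-wise (the
+  // sums cannot exceed 128, so they still fit in a byte), split the even and
+  // odd columns into 16-bit lanes, add them, then round the 2x2 sum by 2 bits.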
+ do {
+ const __m128i v_ra_b = mm_loadl_64(mask);
+ const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+ v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sx_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = mm_loadu_128(mask);
+ const __m128i v_rb_b = mm_loadu_128(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+ v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sx_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ral_b = mm_loadu_128(mask + 2 * c);
+ const __m128i v_rah_b = mm_loadu_128(mask + 2 * c + 16);
+ const __m128i v_rbl_b = mm_loadu_128(mask + mask_stride + 2 * c);
+ const __m128i v_rbh_b = mm_loadu_128(mask + mask_stride + 2 * c + 16);
+ const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
+ const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+ const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
+ const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+ const __m128i v_rvsbl_w = _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1),
+ v_zmask_b);
+ const __m128i v_rvsbh_w = _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1),
+ v_zmask_b);
+ const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+ const __m128i v_m0l_w = mm_roundn_epu16(v_rsl_w, 2);
+ const __m128i v_m0h_w = mm_roundn_epu16(v_rsh_w, 2);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+ v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+ v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ mm_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_blend_mask6_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx) {
+ typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w);
+
+ static blend_fn blend[3][2][2] = { // width_index X subx X suby
+ { // w % 16 == 0
+ {blend_mask6_w16n_sse4_1, blend_mask6_sy_w16n_sse4_1},
+ {blend_mask6_sx_w16n_sse4_1, blend_mask6_sx_sy_w16n_sse4_1}
+ }, { // w == 4
+ {blend_mask6_w4_sse4_1, blend_mask6_sy_w4_sse4_1},
+ {blend_mask6_sx_w4_sse4_1, blend_mask6_sx_sy_w4_sse4_1}
+ }, { // w == 8
+ {blend_mask6_w8_sse4_1, blend_mask6_sy_w8_sse4_1},
+ {blend_mask6_sx_w8_sse4_1, blend_mask6_sx_sy_w8_sse4_1}
+ }
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
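+  // (w >> 2) & 3 maps w == 4 -> 1, w == 8 -> 2 and any multiple of 16 -> 0,
+  // matching the layout of the table above.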
+ blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, mask_stride,
+ h, w);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
+typedef __m128i (*blend_unit_fn)(uint16_t *src0, uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w);
+
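+// Up to 10 bits per sample the products src * m (at most 1023 * 64) still fit
+// in 16 bits, so the *_b10 kernels can use _mm_mullo_epi16. For 12-bit input
+// the products overflow 16 bits, so the *_b12 kernels interleave source and
+// mask values and accumulate in 32 bits with _mm_madd_epi16.
+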
+static INLINE __m128i blend_4_b10(uint16_t *src0, uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = mm_loadl_64(src0);
+ const __m128i v_s1_w = mm_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(uint16_t *src0, uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = mm_loadu_128(src0);
+ const __m128i v_s1_w = mm_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_4_b12(uint16_t *src0, uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = mm_loadl_64(src0);
+ const __m128i v_s1_w = mm_loadl_64(src1);
+
+ // Interleave
+ const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+ // Scale
+ const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, MASK_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+ // Round
+ const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(uint16_t *src0, uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = mm_loadu_128(src0);
+ const __m128i v_s1_w = mm_loadu_128(src1);
+
+ // Interleave
+ const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+ const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+ const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+ // Scale
+ const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, MASK_BITS - 1);
+ const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, MASK_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+ // Round
+ const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ const __m128i v_m0_b = mm_loadl_32(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ mm_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_mask6_b12_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_m0_b = mm_loadl_64(mask + c);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ mm_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_mask6_b12_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ const __m128i v_r_b = mm_loadl_64(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ mm_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_mask6_b12_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_r_b = mm_loadu_128(mask + 2 * c);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ mm_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_mask6_b12_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ const __m128i v_ra_b = mm_loadl_32(mask);
+ const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ mm_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_mask6_b12_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = mm_loadl_64(mask + c);
+ const __m128i v_rb_b = mm_loadl_64(mask + c + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ mm_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_mask6_b12_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ const __m128i v_ra_b = mm_loadl_64(mask);
+ const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+ v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ mm_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_mask6_b12_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = mm_loadu_128(mask + 2 * c);
+      const __m128i v_rb_b = mm_loadu_128(mask + 2 * c + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+ v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ mm_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_mask6_b12_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_highbd_blend_mask6_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
+ uint8_t *src0_8, uint32_t src0_stride,
+ uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx, int bd) {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w);
+
+ static blend_fn blend[2][2][2][2] = { // bd_index X width_index X subx X suby
+ { // bd == 8 or 10
+ { // w % 8 == 0
+ {blend_mask6_b10_w8n_sse4_1, blend_mask6_b10_sy_w8n_sse4_1},
+ {blend_mask6_b10_sx_w8n_sse4_1, blend_mask6_b10_sx_sy_w8n_sse4_1}
+ }, { // w == 4
+ {blend_mask6_b10_w4_sse4_1, blend_mask6_b10_sy_w4_sse4_1},
+ {blend_mask6_b10_sx_w4_sse4_1, blend_mask6_b10_sx_sy_w4_sse4_1}
+ }
+ },
+ { // bd == 12
+ { // w % 8 == 0
+ {blend_mask6_b12_w8n_sse4_1, blend_mask6_b12_sy_w8n_sse4_1},
+ {blend_mask6_b12_sx_w8n_sse4_1, blend_mask6_b12_sx_sy_w8n_sse4_1}
+ }, { // w == 4
+ {blend_mask6_b12_w4_sse4_1, blend_mask6_b12_sy_w4_sse4_1},
+ {blend_mask6_b12_sx_w4_sse4_1, blend_mask6_b12_sx_sy_w4_sse4_1}
+ }
+ }
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
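+  // The first index selects the 12-bit kernels; (w >> 2) & 1 maps w == 4 -> 1
+  // and any multiple of 8 -> 0.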
+ blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, mask_stride,
+ h, w);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH