add_ldflags "-mmacosx-version-min=10.9"
;;
*-iphonesimulator-*)
- add_cflags "-miphoneos-version-min=5.0"
- add_ldflags "-miphoneos-version-min=5.0"
+ add_cflags "-miphoneos-version-min=6.0"
+ add_ldflags "-miphoneos-version-min=6.0"
osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
add_cflags "-isysroot ${osx_sdk_dir}"
add_ldflags "-isysroot ${osx_sdk_dir}"
--- /dev/null
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+// Horizontally and vertically the filter needs a 32x32 block: 8 coefficients
+// preceding the filtered section, 16 within it, and 8 following it.
+const int kNumCoeffs = 1024;
+
+const int number_of_iterations = 10000;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count, int bd);
+typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd);
+#else
+typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count);
+typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
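+// Each tuple is <optimized function, reference C function, bit depth>.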
+typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
+typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count, int bd) {
+ vp9_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count, int bd) {
+ vp9_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count, int bd) {
+ vp9_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count, int bd) {
+ vp9_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
+}
+#else
+void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SSE2
+
+class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
+ public:
+ virtual ~Loop8Test6Param() {}
+ virtual void SetUp() {
+ loopfilter_op_ = GET_PARAM(0);
+ ref_loopfilter_op_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
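+    // mask_ keeps generated pixel values within range for this bit depth.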
+ mask_ = (1 << bit_depth_) - 1;
+ }
+
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ int bit_depth_;
+ int mask_;
+ loop_op_t loopfilter_op_;
+ loop_op_t ref_loopfilter_op_;
+};
+
+class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
+ public:
+ virtual ~Loop8Test9Param() {}
+ virtual void SetUp() {
+ loopfilter_op_ = GET_PARAM(0);
+ ref_loopfilter_op_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ mask_ = (1 << bit_depth_) - 1;
+ }
+
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ int bit_depth_;
+ int mask_;
+ dual_loop_op_t loopfilter_op_;
+ dual_loop_op_t ref_loopfilter_op_;
+};
+
+TEST_P(Loop8Test6Param, OperationCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32_t bd = bit_depth_;
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+ DECLARE_ALIGNED_ARRAY(8, uint8_t, s, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(8, uint8_t, ref_s, kNumCoeffs);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ int err_count_total = 0;
+ int first_failure = -1;
+ for (int i = 0; i < count_test_block; ++i) {
+ int err_count = 0;
+ uint8_t tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, limit[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+    int32_t p = kNumCoeffs / 32;
+ int count = 1;
+
+ uint16_t tmp_s[kNumCoeffs];
+ int j = 0;
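+    // Fill tmp_s with runs of correlated values so the filter masks are
+    // exercised on both smooth areas and sharp transitions.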
+ while (j < kNumCoeffs) {
+ uint8_t val = rnd.Rand8();
+ if (val & 0x80) { // 50% chance to choose a new value.
+ tmp_s[j] = rnd.Rand16();
+ j++;
+      } else { // 50% chance to repeat previous value in row X times.
+        int k = 0;
+        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+          if (j < 1) {
+            tmp_s[j] = rnd.Rand16();
+          } else if (val & 0x20) { // Increment by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] + (*limit - 1));
+          } else { // Decrement by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] - (*limit - 1));
+ }
+ j++;
+ }
+ }
+ }
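+    // Odd iterations use the pattern directly; even iterations transpose it
+    // so vertical and horizontal filters see the same data.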
+ for (j = 0; j < kNumCoeffs; j++) {
+ if (i % 2) {
+ s[j] = tmp_s[j] & mask_;
+ } else {
+ s[j] = tmp_s[p * (j % p) + j / p] & mask_;
+ }
+ ref_s[j] = s[j];
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+ ASM_REGISTER_STATE_CHECK(
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
+ ASM_REGISTER_STATE_CHECK(
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ err_count += ref_s[j] != s[j];
+ }
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Loop8Test6Param, C output doesn't match SSE2 "
+ "loopfilter output. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test6Param, ValueCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32_t bd = bit_depth_;
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+ DECLARE_ALIGNED_ARRAY(8, uint8_t, s, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(8, uint8_t, ref_s, kNumCoeffs);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ int err_count_total = 0;
+ int first_failure = -1;
+ for (int i = 0; i < count_test_block; ++i) {
+ int err_count = 0;
+ uint8_t tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, limit[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ int32_t p = kNumCoeffs / 32;
+ int count = 1;
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ s[j] = rnd.Rand16() & mask_;
+ ref_s[j] = s[j];
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+ ASM_REGISTER_STATE_CHECK(
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
+ ASM_REGISTER_STATE_CHECK(
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ err_count += ref_s[j] != s[j];
+ }
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Loop8Test6Param, C output doesn't match SSE2 "
+ "loopfilter output. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test9Param, OperationCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32_t bd = bit_depth_;
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+ DECLARE_ALIGNED_ARRAY(8, uint8_t, s, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(8, uint8_t, ref_s, kNumCoeffs);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ int err_count_total = 0;
+ int first_failure = -1;
+ for (int i = 0; i < count_test_block; ++i) {
+ int err_count = 0;
+ uint8_t tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ int32_t p = kNumCoeffs / 32;
+ uint16_t tmp_s[kNumCoeffs];
+ int j = 0;
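+    // Use the tighter of the two limits when building correlated runs.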
+ const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1;
+ while (j < kNumCoeffs) {
+ uint8_t val = rnd.Rand8();
+ if (val & 0x80) { // 50% chance to choose a new value.
+ tmp_s[j] = rnd.Rand16();
+ j++;
+ } else { // 50% chance to repeat previous value in row X times.
+ int k = 0;
+ while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+ if (j < 1) {
+ tmp_s[j] = rnd.Rand16();
+ } else if (val & 0x20) { // Increment by a value within the limit.
+ tmp_s[j] = (tmp_s[j - 1] + (limit - 1));
+          } else { // Decrement by a value within the limit.
+ tmp_s[j] = (tmp_s[j - 1] - (limit - 1));
+ }
+ j++;
+ }
+ }
+ }
+ for (j = 0; j < kNumCoeffs; j++) {
+ if (i % 2) {
+ s[j] = tmp_s[j] & mask_;
+ } else {
+ s[j] = tmp_s[p * (j % p) + j / p] & mask_;
+ }
+ ref_s[j] = s[j];
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1, bd);
+ ASM_REGISTER_STATE_CHECK(
+ loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1, bd));
+#else
+ ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
+ ASM_REGISTER_STATE_CHECK(
+ loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ err_count += ref_s[j] != s[j];
+ }
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Loop8Test9Param, C output doesn't match SSE2 "
+ "loopfilter output. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test9Param, ValueCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+ DECLARE_ALIGNED_ARRAY(8, uint8_t, s, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(8, uint8_t, ref_s, kNumCoeffs);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ int err_count_total = 0;
+ int first_failure = -1;
+ for (int i = 0; i < count_test_block; ++i) {
+ int err_count = 0;
+ uint8_t tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ tmp = rnd.Rand8();
+ DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = {
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+ };
+ int32_t p = kNumCoeffs / 32; // TODO(pdlf) can we have non-square here?
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ s[j] = rnd.Rand16() & mask_;
+ ref_s[j] = s[j];
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32_t bd = bit_depth_;
+ ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1, bd);
+ ASM_REGISTER_STATE_CHECK(
+ loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd));
+#else
+ ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1);
+ ASM_REGISTER_STATE_CHECK(
+ loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ err_count += ref_s[j] != s[j];
+ }
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Loop8Test9Param, C output doesn't match SSE2"
+ "loopfilter output. "
+ << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
+ ::testing::Values(
+ make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
+ &vp9_highbd_lpf_horizontal_4_c, 8),
+ make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
+ &vp9_highbd_lpf_vertical_4_c, 8),
+ make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
+ &vp9_highbd_lpf_horizontal_8_c, 8),
+ make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+ &vp9_highbd_lpf_horizontal_16_c, 8),
+ make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
+ &vp9_highbd_lpf_vertical_8_c, 8),
+ make_tuple(&wrapper_vertical_16_sse2,
+ &wrapper_vertical_16_c, 8),
+ make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
+ &vp9_highbd_lpf_horizontal_4_c, 10),
+ make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
+ &vp9_highbd_lpf_vertical_4_c, 10),
+ make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
+ &vp9_highbd_lpf_horizontal_8_c, 10),
+ make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+ &vp9_highbd_lpf_horizontal_16_c, 10),
+ make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
+ &vp9_highbd_lpf_vertical_8_c, 10),
+ make_tuple(&wrapper_vertical_16_sse2,
+ &wrapper_vertical_16_c, 10),
+ make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
+ &vp9_highbd_lpf_horizontal_4_c, 12),
+ make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
+ &vp9_highbd_lpf_vertical_4_c, 12),
+ make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
+ &vp9_highbd_lpf_horizontal_8_c, 12),
+ make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+ &vp9_highbd_lpf_horizontal_16_c, 12),
+ make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
+ &vp9_highbd_lpf_vertical_8_c, 12),
+ make_tuple(&wrapper_vertical_16_sse2,
+ &wrapper_vertical_16_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
+ ::testing::Values(
+ make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8),
+ make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8),
+        make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8),
+        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ SSE2_C_COMPARE_DUAL, Loop8Test6Param,
+ ::testing::Values(
+ make_tuple(&wrapper_vertical_16_dual_sse2,
+ &wrapper_vertical_16_dual_c, 8),
+ make_tuple(&wrapper_vertical_16_dual_sse2,
+ &wrapper_vertical_16_dual_c, 10),
+ make_tuple(&wrapper_vertical_16_dual_sse2,
+ &wrapper_vertical_16_dual_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ SSE2_C_COMPARE_DUAL, Loop8Test6Param,
+ ::testing::Values(
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SSE2
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_DUAL, Loop8Test9Param,
+ ::testing::Values(
+ make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
+ &vp9_highbd_lpf_horizontal_4_dual_c, 8),
+ make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
+ &vp9_highbd_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
+ &vp9_highbd_lpf_vertical_4_dual_c, 8),
+ make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
+ &vp9_highbd_lpf_vertical_8_dual_c, 8),
+ make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
+ &vp9_highbd_lpf_horizontal_4_dual_c, 10),
+ make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
+ &vp9_highbd_lpf_horizontal_8_dual_c, 10),
+ make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
+ &vp9_highbd_lpf_vertical_4_dual_c, 10),
+ make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
+ &vp9_highbd_lpf_vertical_8_dual_c, 10),
+ make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
+ &vp9_highbd_lpf_horizontal_4_dual_c, 12),
+ make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
+ &vp9_highbd_lpf_horizontal_8_dual_c, 12),
+ make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
+ &vp9_highbd_lpf_vertical_4_dual_c, 12),
+ make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
+ &vp9_highbd_lpf_vertical_8_dual_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_DUAL, Loop8Test9Param,
+ ::testing::Values(
+ make_tuple(&vp9_lpf_horizontal_4_dual_sse2,
+ &vp9_lpf_horizontal_4_dual_c, 8),
+ make_tuple(&vp9_lpf_horizontal_8_dual_sse2,
+ &vp9_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&vp9_lpf_vertical_4_dual_sse2,
+ &vp9_lpf_vertical_4_dual_c, 8),
+ make_tuple(&vp9_lpf_vertical_8_dual_sse2,
+ &vp9_lpf_vertical_8_dual_c, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+} // namespace
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc
ifeq ($(CONFIG_VP9_ENCODER),yes)
for (plane = 0; plane < 3; ++plane) {
const unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane);
+ const int w = vpx_img_plane_width(img, plane) *
+ ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
const int h = vpx_img_plane_height(img, plane);
int y;
#include <arm_neon.h>
#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
static INLINE void vp8_loop_filter_neon(
uint8x16_t qblimit, // flimit
static INLINE void write_4x8(unsigned char *dst, int pitch,
const uint8x8x4_t result) {
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
- vst4_lane_u8(dst, result, 0);
- dst += pitch;
- vst4_lane_u8(dst, result, 1);
- dst += pitch;
- vst4_lane_u8(dst, result, 2);
- dst += pitch;
- vst4_lane_u8(dst, result, 3);
- dst += pitch;
- vst4_lane_u8(dst, result, 4);
- dst += pitch;
- vst4_lane_u8(dst, result, 5);
- dst += pitch;
- vst4_lane_u8(dst, result, 6);
- dst += pitch;
- vst4_lane_u8(dst, result, 7);
-#else
+#ifdef VPX_INCOMPATIBLE_GCC
/*
* uint8x8x4_t result
00 01 02 03 | 04 05 06 07
vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
-#endif
+#else
+ vst4_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 7);
+#endif // VPX_INCOMPATIBLE_GCC
}
void vp8_loop_filter_vertical_edge_y_neon(
#include <arm_neon.h>
#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-static INLINE void write_2x8(unsigned char *dst, int pitch,
- const uint8x8x2_t result,
- const uint8x8x2_t result2) {
- vst2_lane_u8(dst, result, 0);
- dst += pitch;
- vst2_lane_u8(dst, result, 1);
- dst += pitch;
- vst2_lane_u8(dst, result, 2);
- dst += pitch;
- vst2_lane_u8(dst, result, 3);
- dst += pitch;
- vst2_lane_u8(dst, result, 4);
- dst += pitch;
- vst2_lane_u8(dst, result, 5);
- dst += pitch;
- vst2_lane_u8(dst, result, 6);
- dst += pitch;
- vst2_lane_u8(dst, result, 7);
- dst += pitch;
-
- vst2_lane_u8(dst, result2, 0);
- dst += pitch;
- vst2_lane_u8(dst, result2, 1);
- dst += pitch;
- vst2_lane_u8(dst, result2, 2);
- dst += pitch;
- vst2_lane_u8(dst, result2, 3);
- dst += pitch;
- vst2_lane_u8(dst, result2, 4);
- dst += pitch;
- vst2_lane_u8(dst, result2, 5);
- dst += pitch;
- vst2_lane_u8(dst, result2, 6);
- dst += pitch;
- vst2_lane_u8(dst, result2, 7);
-}
-#else
+#ifdef VPX_INCOMPATIBLE_GCC
static INLINE void write_2x4(unsigned char *dst, int pitch,
const uint8x8x2_t result) {
/*
dst += pitch * 8;
write_2x4(dst, pitch, result2);
}
-#endif
-
+#else
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ vst2_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 7);
+ dst += pitch;
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-static INLINE
-uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
- x = vld4_lane_u8(src, x, 0);
- src += pitch;
- x = vld4_lane_u8(src, x, 1);
- src += pitch;
- x = vld4_lane_u8(src, x, 2);
- src += pitch;
- x = vld4_lane_u8(src, x, 3);
- src += pitch;
- x = vld4_lane_u8(src, x, 4);
- src += pitch;
- x = vld4_lane_u8(src, x, 5);
- src += pitch;
- x = vld4_lane_u8(src, x, 6);
- src += pitch;
- x = vld4_lane_u8(src, x, 7);
- return x;
+ vst2_lane_u8(dst, result2, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 7);
}
-#else
+#endif // VPX_INCOMPATIBLE_GCC
+
+#ifdef VPX_INCOMPATIBLE_GCC
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
const uint8x8_t a = vld1_u8(src);
return x;
}
-#endif
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+ x = vld4_lane_u8(src, x, 0);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 1);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 2);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 3);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 4);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 5);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 6);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 7);
+ return x;
+}
+#endif // VPX_INCOMPATIBLE_GCC
static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
unsigned char *s,
*/
#include <arm_neon.h>
+#include "vpx_ports/arm.h"
+#ifdef VPX_INCOMPATIBLE_GCC
+#include "./vp8_rtcd.h"
+void vp8_short_walsh4x4_neon(
+ int16_t *input,
+ int16_t *output,
+ int pitch) {
+ vp8_short_walsh4x4_c(input, output, pitch);
+}
+#else
void vp8_short_walsh4x4_neon(
int16_t *input,
int16_t *output,
vst1q_s16(output + 8, q1s16);
return;
}
+#endif // VPX_INCOMPATIBLE_GCC
#endif
/* central mv */
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
+ bestmv->as_mv.row *= 8;
+ bestmv->as_mv.col *= 8;
startmv = *bestmv;
/* calculate central point error */
#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_scale.h"
#include "vp9/common/vp9_seg_common.h"
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <assert.h>
#include <math.h>
-#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
#define dual_set_epi16(a, b) \
_mm_set_epi16(b, b, b, b, a, a, a, a)
-// Note:
-// tran_low_t is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-#if CONFIG_VP9_HIGHBITDEPTH
-typedef int64_t tran_high_t;
-typedef int32_t tran_low_t;
-#else
-typedef int32_t tran_high_t;
-typedef int16_t tran_low_t;
-#endif
-
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_filter_selectively_vert_row2(PLANE_TYPE plane_type,
+ uint16_t *s, int pitch,
+ unsigned int mask_16x16_l,
+ unsigned int mask_8x8_l,
+ unsigned int mask_4x4_l,
+ unsigned int mask_4x4_int_l,
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl, int bd) {
+ const int mask_shift = plane_type ? 4 : 8;
+ const int mask_cutoff = plane_type ? 0xf : 0xff;
+ const int lfl_forward = plane_type ? 4 : 8;
+
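+  // Each mask packs one bit per 8x8 column; the bits for the second row of
+  // blocks sit mask_shift bits higher, and their filter levels are
+  // lfl_forward entries ahead in lfl.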
+ unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+ unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+ unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+ unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+ unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+ unsigned int mask;
+
+ for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+ mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+ mask; mask >>= 1) {
+ const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+ const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+
+ // TODO(yunqingwang): count in loopfilter functions should be removed.
+ if (mask & 1) {
+ if ((mask_16x16_0 | mask_16x16_1) & 1) {
+ if ((mask_16x16_0 & mask_16x16_1) & 1) {
+ vp9_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ } else if (mask_16x16_0 & 1) {
+ vp9_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ } else {
+          vp9_highbd_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ }
+
+ if ((mask_8x8_0 | mask_8x8_1) & 1) {
+ if ((mask_8x8_0 & mask_8x8_1) & 1) {
+ vp9_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ } else if (mask_8x8_0 & 1) {
+ vp9_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, 1, bd);
+ } else {
+ vp9_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, 1, bd);
+ }
+ }
+
+ if ((mask_4x4_0 | mask_4x4_1) & 1) {
+ if ((mask_4x4_0 & mask_4x4_1) & 1) {
+ vp9_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ } else if (mask_4x4_0 & 1) {
+ vp9_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, 1, bd);
+ } else {
+ vp9_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, 1, bd);
+ }
+ }
+
+ if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
+ if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
+ vp9_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ } else if (mask_4x4_int_0 & 1) {
+ vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, 1, bd);
+ } else {
+ vp9_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, 1, bd);
+ }
+ }
+ }
+
+ s += 8;
+ lfl += 1;
+ mask_16x16_0 >>= 1;
+ mask_8x8_0 >>= 1;
+ mask_4x4_0 >>= 1;
+ mask_4x4_int_0 >>= 1;
+ mask_16x16_1 >>= 1;
+ mask_8x8_1 >>= 1;
+ mask_4x4_1 >>= 1;
+ mask_4x4_int_1 >>= 1;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static void filter_selectively_horiz(uint8_t *s, int pitch,
unsigned int mask_16x16,
unsigned int mask_8x8,
}
} else if (mask_8x8 & 1) {
if ((mask_8x8 & 3) == 3) {
- // Next block's thresholds
+ // Next block's thresholds.
const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
}
} else if (mask_4x4 & 1) {
if ((mask_4x4 & 3) == 3) {
- // Next block's thresholds
+ // Next block's thresholds.
const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_filter_selectively_horiz(uint16_t *s, int pitch,
+ unsigned int mask_16x16,
+ unsigned int mask_8x8,
+ unsigned int mask_4x4,
+ unsigned int mask_4x4_int,
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl, int bd) {
+ unsigned int mask;
+ int count;
+
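+  // count is 1 or 2: when two adjacent 8x8 blocks share a filter size, the
+  // dual variants handle both in a single call.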
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+ mask; mask >>= count) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+ count = 1;
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ if ((mask_16x16 & 3) == 3) {
+ vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2, bd);
+ count = 2;
+ } else {
+ vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1, bd);
+ }
+ } else if (mask_8x8 & 1) {
+ if ((mask_8x8 & 3) == 3) {
+ // Next block's thresholds.
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+ vp9_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr, bd);
+
+ if ((mask_4x4_int & 3) == 3) {
+ vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr, bd);
+ } else {
+ if (mask_4x4_int & 1) {
+ vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1, bd);
+ } else if (mask_4x4_int & 2) {
+ vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+ lfin->lim, lfin->hev_thr, 1, bd);
+ }
+ }
+ count = 2;
+ } else {
+ vp9_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1, bd);
+
+ if (mask_4x4_int & 1) {
+ vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1, bd);
+ }
+ }
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & 3) == 3) {
+ // Next block's thresholds.
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+ vp9_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr, bd);
+ if ((mask_4x4_int & 3) == 3) {
+ vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr, bd);
+ } else {
+ if (mask_4x4_int & 1) {
+ vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1, bd);
+ } else if (mask_4x4_int & 2) {
+ vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+ lfin->lim, lfin->hev_thr, 1, bd);
+ }
+ }
+ count = 2;
+ } else {
+ vp9_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1, bd);
+
+ if (mask_4x4_int & 1) {
+ vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1, bd);
+ }
+ }
+ } else if (mask_4x4_int & 1) {
+ vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1, bd);
+ }
+ }
+ s += 8 * count;
+ lfl += count;
+ mask_16x16 >>= count;
+ mask_8x8 >>= count;
+ mask_4x4 >>= count;
+ mask_4x4_int >>= count;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
// This function ors into the current lfm structure, where to do loop
// filters for the specific mi we are looking at. It uses information
// including the block_size_type (32x16, 32x32, etc.), the transform size,
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_filter_selectively_vert(uint16_t *s, int pitch,
+ unsigned int mask_16x16,
+ unsigned int mask_8x8,
+ unsigned int mask_4x4,
+ unsigned int mask_4x4_int,
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl, int bd) {
+ unsigned int mask;
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+ mask; mask >>= 1) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ vp9_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, bd);
+ } else if (mask_8x8 & 1) {
+ vp9_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1, bd);
+ } else if (mask_4x4 & 1) {
+ vp9_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1, bd);
+ }
+ }
+ if (mask_4x4_int & 1)
+ vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1, bd);
+ s += 8;
+ lfl += 1;
+ mask_16x16 >>= 1;
+ mask_8x8 >>= 1;
+ mask_4x4 >>= 1;
+ mask_4x4_int >>= 1;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static void filter_block_plane_non420(VP9_COMMON *cm,
struct macroblockd_plane *plane,
MODE_INFO *mi_8x8,
int mi_row, int mi_col) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
- const int row_step = 1 << ss_x;
- const int col_step = 1 << ss_y;
+ const int row_step = 1 << ss_y;
+ const int col_step = 1 << ss_x;
const int row_step_stride = cm->mi_stride * row_step;
struct buf_2d *const dst = &plane->dst;
uint8_t* const dst0 = dst->buf;
// Disable filtering on the leftmost column
border_mask = ~(mi_col == 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ high_filter_selectively_vert(CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride,
+ mask_16x16_c & border_mask,
+ mask_8x8_c & border_mask,
+ mask_4x4_c & border_mask,
+ mask_4x4_int[r],
+ &cm->lf_info, &lfl[r << 3],
+ (int)cm->bit_depth);
+ } else {
+ filter_selectively_vert(dst->buf, dst->stride,
+ mask_16x16_c & border_mask,
+ mask_8x8_c & border_mask,
+ mask_4x4_c & border_mask,
+ mask_4x4_int[r],
+ &cm->lf_info, &lfl[r << 3]);
+ }
+#else
filter_selectively_vert(dst->buf, dst->stride,
mask_16x16_c & border_mask,
mask_8x8_c & border_mask,
mask_4x4_c & border_mask,
mask_4x4_int[r],
&cm->lf_info, &lfl[r << 3]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
dst->buf += 8 * dst->stride;
mi_8x8 += row_step_stride;
}
mask_8x8_r = mask_8x8[r];
mask_4x4_r = mask_4x4[r];
}
-
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ high_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride,
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
+ mask_4x4_int_r,
+ &cm->lf_info, &lfl[r << 3],
+ (int)cm->bit_depth);
+ } else {
+ filter_selectively_horiz(dst->buf, dst->stride,
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
+ mask_4x4_int_r,
+ &cm->lf_info, &lfl[r << 3]);
+ }
+#else
filter_selectively_horiz(dst->buf, dst->stride,
mask_16x16_r,
mask_8x8_r,
mask_4x4_r,
mask_4x4_int_r,
&cm->lf_info, &lfl[r << 3]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
dst->buf += 8 * dst->stride;
}
}
unsigned int mask_4x4_l = mask_4x4 & 0xffff;
unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
- // Disable filtering on the leftmost column
+ // Disable filtering on the leftmost column.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ high_filter_selectively_vert_row2(plane->plane_type,
+ CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride,
+ mask_16x16_l,
+ mask_8x8_l,
+ mask_4x4_l,
+ mask_4x4_int_l,
+ &cm->lf_info, &lfm->lfl_y[r << 3],
+ (int)cm->bit_depth);
+ } else {
+ filter_selectively_vert_row2(plane->plane_type,
+ dst->buf, dst->stride,
+ mask_16x16_l,
+ mask_8x8_l,
+ mask_4x4_l,
+ mask_4x4_int_l,
+ &cm->lf_info,
+ &lfm->lfl_y[r << 3]);
+ }
+#else
filter_selectively_vert_row2(plane->plane_type,
dst->buf, dst->stride,
mask_16x16_l,
mask_4x4_l,
mask_4x4_int_l,
&cm->lf_info, &lfm->lfl_y[r << 3]);
-
+#endif // CONFIG_VP9_HIGHBITDEPTH
dst->buf += 16 * dst->stride;
mask_16x16 >>= 16;
mask_8x8 >>= 16;
mask_4x4_r = mask_4x4 & 0xff;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ high_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride,
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
+ mask_4x4_int & 0xff,
+ &cm->lf_info,
+ &lfm->lfl_y[r << 3],
+ (int)cm->bit_depth);
+ } else {
+ filter_selectively_horiz(dst->buf, dst->stride,
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
+ mask_4x4_int & 0xff,
+ &cm->lf_info,
+ &lfm->lfl_y[r << 3]);
+ }
+#else
filter_selectively_horiz(dst->buf, dst->stride,
mask_16x16_r,
mask_8x8_r,
mask_4x4_r,
mask_4x4_int & 0xff,
- &cm->lf_info, &lfm->lfl_y[r << 3]);
+ &cm->lf_info,
+ &lfm->lfl_y[r << 3]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
dst->buf += 8 * dst->stride;
mask_16x16 >>= 8;
unsigned int mask_4x4_l = mask_4x4 & 0xff;
unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
- // Disable filtering on the leftmost column
+ // Disable filtering on the leftmost column.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ high_filter_selectively_vert_row2(plane->plane_type,
+ CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride,
+ mask_16x16_l,
+ mask_8x8_l,
+ mask_4x4_l,
+ mask_4x4_int_l,
+ &cm->lf_info,
+ &lfm->lfl_uv[r << 1],
+ (int)cm->bit_depth);
+ } else {
+ filter_selectively_vert_row2(plane->plane_type,
+ dst->buf, dst->stride,
+ mask_16x16_l,
+ mask_8x8_l,
+ mask_4x4_l,
+ mask_4x4_int_l,
+ &cm->lf_info,
+ &lfm->lfl_uv[r << 1]);
+ }
+#else
filter_selectively_vert_row2(plane->plane_type,
dst->buf, dst->stride,
mask_16x16_l,
mask_8x8_l,
mask_4x4_l,
mask_4x4_int_l,
- &cm->lf_info, &lfm->lfl_uv[r << 1]);
+ &cm->lf_info,
+ &lfm->lfl_uv[r << 1]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
dst->buf += 16 * dst->stride;
mask_16x16 >>= 8;
mask_4x4_r = mask_4x4 & 0xf;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ high_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride,
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
+ mask_4x4_int_r,
+ &cm->lf_info,
+ &lfm->lfl_uv[r << 1],
+ (int)cm->bit_depth);
+ } else {
+ filter_selectively_horiz(dst->buf, dst->stride,
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
+ mask_4x4_int_r,
+ &cm->lf_info,
+ &lfm->lfl_uv[r << 1]);
+ }
+#else
filter_selectively_horiz(dst->buf, dst->stride,
mask_16x16_r,
mask_8x8_r,
mask_4x4_r,
mask_4x4_int_r,
- &cm->lf_info, &lfm->lfl_uv[r << 1]);
+ &cm->lf_info,
+ &lfm->lfl_uv[r << 1]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
dst->buf += 8 * dst->stride;
mask_16x16 >>= 4;
return (int8_t)clamp(t, -128, 127);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
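+  // Clamp to the signed range that corresponds to 8-bit [-128, 127] scaled
+  // by 1 << (bd - 8).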
+ switch (bd) {
+ case 10:
+      return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
+    case 12:
+      return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
+    case 8:
+    default:
+      return (int16_t)clamp(t, -128, 128 - 1);
+ }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
// should we apply any filter at all: 11111111 yes, 00000000 no
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
uint8_t p3, uint8_t p2,
const uint8_t *limit, const uint8_t *thresh) {
mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Should we apply any filter at all: 11111111 yes, 00000000 no?
+static INLINE int8_t high_filter_mask(uint8_t limit, uint8_t blimit,
+ uint16_t p3, uint16_t p2,
+ uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1,
+ uint16_t q2, uint16_t q3, int bd) {
+ int8_t mask = 0;
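+  // The 8-bit thresholds are scaled up to the working bit depth.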
+ int16_t limit16 = (uint16_t)limit << (bd - 8);
+ int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+ mask |= (abs(p3 - p2) > limit16) * -1;
+ mask |= (abs(p2 - p1) > limit16) * -1;
+ mask |= (abs(p1 - p0) > limit16) * -1;
+ mask |= (abs(q1 - q0) > limit16) * -1;
+ mask |= (abs(q2 - q1) > limit16) * -1;
+ mask |= (abs(q3 - q2) > limit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t high_flat_mask4(uint8_t thresh,
+ uint16_t p3, uint16_t p2,
+ uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1,
+ uint16_t q2, uint16_t q3, int bd) {
+ int8_t mask = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ mask |= (abs(p1 - p0) > thresh16) * -1;
+ mask |= (abs(q1 - q0) > thresh16) * -1;
+ mask |= (abs(p2 - p0) > thresh16) * -1;
+ mask |= (abs(q2 - q0) > thresh16) * -1;
+ mask |= (abs(p3 - p0) > thresh16) * -1;
+ mask |= (abs(q3 - q0) > thresh16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t high_flat_mask5(uint8_t thresh,
+ uint16_t p4, uint16_t p3,
+ uint16_t p2, uint16_t p1,
+ uint16_t p0, uint16_t q0,
+ uint16_t q1, uint16_t q2,
+ uint16_t q3, uint16_t q4, int bd) {
+ int8_t mask = ~high_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ mask |= (abs(p4 - p0) > thresh16) * -1;
+ mask |= (abs(q4 - q0) > thresh16) * -1;
+ return ~mask;
+}
+
+// Is there high edge variance at the internal edge:
+// 11111111_11111111 yes, 00000000_00000000 no?
+static INLINE int16_t high_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1, int bd) {
+ int16_t hev = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ hev |= (abs(p1 - p0) > thresh16) * -1;
+ hev |= (abs(q1 - q0) > thresh16) * -1;
+ return hev;
+}
+
+static INLINE void high_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
+ uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+ int bd) {
+ int16_t filter1, filter2;
+  // Subtracting (0x80 << shift) re-centers the samples; this is the high
+  // bitdepth analogue of the ^0x80 trick that maps 0..255 onto -128..+127.
+ int shift = bd - 8;
+ const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
+ const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
+ const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
+ const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
+ const uint16_t hev = high_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+
+ // Add outer taps if we have high edge variance.
+ int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
+
+ // Inner taps.
+ filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
+
+  // Save bottom 3 bits so that we round one side +4 and the other +3;
+  // if it equals 4 we'll set it to adjust by -1 to account for the fact
+  // that we'd round it by 3 the other way.
+ filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
+ filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
+
+ *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
+ *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
+
+ // Outer tap adjustments.
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+ *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
+ *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
+}
+
+void vp9_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int count, int bd) {
+ int i;
+
+  // Mirrors the 8-bit loop filter, but operates on uint16_t samples.
+ for (i = 0; i < 8 * count; ++i) {
+ const uint16_t p3 = s[-4 * p];
+ const uint16_t p2 = s[-3 * p];
+ const uint16_t p1 = s[-2 * p];
+ const uint16_t p0 = s[-p];
+ const uint16_t q0 = s[0 * p];
+ const uint16_t q1 = s[1 * p];
+ const uint16_t q2 = s[2 * p];
+ const uint16_t q3 = s[3 * p];
+ const int8_t mask = high_filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ high_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+ ++s;
+ }
+}
+
+void vp9_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1,
+ int bd) {
+ vp9_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
+ vp9_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+void vp9_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count, int bd) {
+ int i;
+
+  // Mirrors the 8-bit loop filter, but operates on uint16_t samples.
+ for (i = 0; i < 8 * count; ++i) {
+ const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask = high_filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ high_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
+ s += pitch;
+ }
+}
+
+void vp9_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1,
+ int bd) {
+ vp9_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+ vp9_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+ thresh1, 1, bd);
+}
+
+static INLINE void high_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+ uint16_t *op3, uint16_t *op2,
+ uint16_t *op1, uint16_t *op0,
+ uint16_t *oq0, uint16_t *oq1,
+ uint16_t *oq2, uint16_t *oq3, int bd) {
+ if (flat && mask) {
+ const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+ // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+ *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+ *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+ *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+ } else {
+ high_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+ }
+}
+
+void vp9_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count, int bd) {
+ int i;
+
+  // Mirrors the 8-bit loop filter, but operates on uint16_t samples.
+ for (i = 0; i < 8 * count; ++i) {
+ const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+ const int8_t mask = high_filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat = high_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ high_filter8(mask, *thresh, flat,
+ s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+ s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+ ++s;
+ }
+}
+
+void vp9_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1,
+ int bd) {
+ vp9_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
+ vp9_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+void vp9_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count, int bd) {
+ int i;
+
+ for (i = 0; i < 8 * count; ++i) {
+ const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask = high_filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat = high_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ high_filter8(mask, *thresh, flat,
+ s - 4, s - 3, s - 2, s - 1,
+ s, s + 1, s + 2, s + 3,
+ bd);
+ s += pitch;
+ }
+}
+
+void vp9_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1,
+ int bd) {
+ vp9_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+ vp9_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+ thresh1, 1, bd);
+}
+
+static INLINE void high_filter16(int8_t mask, uint8_t thresh,
+ uint8_t flat, uint8_t flat2,
+ uint16_t *op7, uint16_t *op6,
+ uint16_t *op5, uint16_t *op4,
+ uint16_t *op3, uint16_t *op2,
+ uint16_t *op1, uint16_t *op0,
+ uint16_t *oq0, uint16_t *oq1,
+ uint16_t *oq2, uint16_t *oq3,
+ uint16_t *oq4, uint16_t *oq5,
+ uint16_t *oq6, uint16_t *oq7, int bd) {
+ if (flat2 && flat && mask) {
+ const uint16_t p7 = *op7;
+ const uint16_t p6 = *op6;
+ const uint16_t p5 = *op5;
+ const uint16_t p4 = *op4;
+ const uint16_t p3 = *op3;
+ const uint16_t p2 = *op2;
+ const uint16_t p1 = *op1;
+ const uint16_t p0 = *op0;
+ const uint16_t q0 = *oq0;
+ const uint16_t q1 = *oq1;
+ const uint16_t q2 = *oq2;
+ const uint16_t q3 = *oq3;
+ const uint16_t q4 = *oq4;
+ const uint16_t q5 = *oq5;
+ const uint16_t q6 = *oq6;
+ const uint16_t q7 = *oq7;
+
+ // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+ *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
+ q0, 4);
+ *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
+ q0 + q1, 4);
+ *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
+ q0 + q1 + q2, 4);
+ *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
+ q0 + q1 + q2 + q3, 4);
+ *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
+ q0 + q1 + q2 + q3 + q4, 4);
+ *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+ q0 + q1 + q2 + q3 + q4 + q5, 4);
+ *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+ *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+ *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
+ q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
+ *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
+ q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
+ *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
+ q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
+ q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
+ q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+ *oq6 = ROUND_POWER_OF_TWO(p0 +
+ q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+ } else {
+ high_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ bd);
+ }
+}
+
+void vp9_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count, int bd) {
+ int i;
+
+  // Mirrors the 8-bit loop filter, but operates on uint16_t samples.
+ for (i = 0; i < 8 * count; ++i) {
+ const uint16_t p3 = s[-4 * p];
+ const uint16_t p2 = s[-3 * p];
+ const uint16_t p1 = s[-2 * p];
+ const uint16_t p0 = s[-p];
+ const uint16_t q0 = s[0 * p];
+ const uint16_t q1 = s[1 * p];
+ const uint16_t q2 = s[2 * p];
+ const uint16_t q3 = s[3 * p];
+ const int8_t mask = high_filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat = high_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat2 = high_flat_mask5(
+ 1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
+ q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
+
+ high_filter16(mask, *thresh, flat, flat2,
+ s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
+ s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+ s, s + 1 * p, s + 2 * p, s + 3 * p,
+ s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
+ bd);
+ ++s;
+ }
+}
+
+static void high_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count, int bd) {
+ int i;
+
+ for (i = 0; i < count; ++i) {
+ const uint16_t p3 = s[-4];
+ const uint16_t p2 = s[-3];
+ const uint16_t p1 = s[-2];
+ const uint16_t p0 = s[-1];
+ const uint16_t q0 = s[0];
+ const uint16_t q1 = s[1];
+ const uint16_t q2 = s[2];
+ const uint16_t q3 = s[3];
+ const int8_t mask = high_filter_mask(*limit, *blimit,
+ p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat = high_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat2 = high_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+ q0, s[4], s[5], s[6], s[7], bd);
+
+ high_filter16(mask, *thresh, flat, flat2,
+ s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+ s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
+ bd);
+ s += p;
+ }
+}
+
+void vp9_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ high_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+}
+
+void vp9_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int bd) {
+ high_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
// VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
vpx_bit_depth_t bit_depth;
+  vpx_bit_depth_t dequant_bit_depth; // Bit depth of the current dequantizer.
#if CONFIG_VP9_POSTPROC
struct postproc_state postproc_state;
#include "vpx_scale/vpx_scale.h"
#include "vpx_scale/yv12config.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_postproc.h"
#include "vp9/common/vp9_systemdependent.h"
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int rows,
+ int cols,
+ int flimit) {
+ uint16_t const *p_src;
+ uint16_t *p_dst;
+ int row;
+ int col;
+ int i;
+ int v;
+ int pitch = src_pixels_per_line;
+ uint16_t d[8];
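+  // d[] is a small ring buffer of filtered samples so the across pass can
+  // write in place two pixels behind the current read position.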
+
+ for (row = 0; row < rows; row++) {
+ // post_proc_down for one row.
+ p_src = src_ptr;
+ p_dst = dst_ptr;
+
+ for (col = 0; col < cols; col++) {
+      int kernel = 4; // Rounding term for the >> 3 below.
+      v = p_src[col];
+
+ for (i = -2; i <= 2; i++) {
+ if (abs(v - p_src[col + i * pitch]) > flimit)
+ goto down_skip_convolve;
+
+ kernel += kernel5[2 + i] * p_src[col + i * pitch];
+ }
+
+ v = (kernel >> 3);
+
+ down_skip_convolve:
+ p_dst[col] = v;
+ }
+
+ /* now post_proc_across */
+ p_src = dst_ptr;
+ p_dst = dst_ptr;
+
+ for (i = 0; i < 8; i++)
+ d[i] = p_src[i];
+
+ for (col = 0; col < cols; col++) {
+ int kernel = 4;
+ v = p_src[col];
+
+ d[col & 7] = v;
+
+ for (i = -2; i <= 2; i++) {
+ if (abs(v - p_src[col + i]) > flimit)
+ goto across_skip_convolve;
+
+ kernel += kernel5[2 + i] * p_src[col + i];
+ }
+
+ d[col & 7] = (kernel >> 3);
+
+ across_skip_convolve:
+ if (col >= 2)
+ p_dst[col - 2] = d[(col - 2) & 7];
+ }
+
+ /* handle the last two pixels */
+ p_dst[col - 2] = d[(col - 2) & 7];
+ p_dst[col - 1] = d[(col - 1) & 7];
+
+ /* next row */
+ src_ptr += pitch;
+ dst_ptr += dst_pixels_per_line;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static int q2mbl(int x) {
if (x < 20) x = 20;
void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
int rows, int cols, int flimit) {
int r, c, i;
-
uint8_t *s = src;
uint8_t d[16];
+ for (r = 0; r < rows; r++) {
+ int sumsq = 0;
+ int sum = 0;
+
+ for (i = -8; i <= 6; i++) {
+ sumsq += s[i] * s[i];
+ sum += s[i];
+ d[i + 8] = 0;
+ }
+
+ for (c = 0; c < cols + 8; c++) {
+ int x = s[c + 7] - s[c - 8];
+ int y = s[c + 7] + s[c - 8];
+
+ sum += x;
+ sumsq += x * y;
+
+ d[c & 15] = s[c];
+
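+ // sumsq * 15 - sum * sum is 15^2 times the variance of the 15-pixel
+ // window; only replace the pixel with the windowed average when that
+ // variance is below flimit.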
+ if (sumsq * 15 - sum * sum < flimit) {
+ d[c & 15] = (8 + sum + s[c]) >> 4;
+ }
+
+ s[c - 8] = d[(c - 8) & 15];
+ }
+ s += pitch;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
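+// Mirrors vp9_mbpost_proc_across_ip_c above, operating on 16-bit pixels.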
+void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
+ int rows, int cols, int flimit) {
+ int r, c, i;
+
+ uint16_t *s = src;
+ uint16_t d[16];
+
for (r = 0; r < rows; r++) {
int sumsq = 0;
s += pitch;
}
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
int rows, int cols, int flimit) {
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
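+// Column-wise counterpart of the across filter: the same running variance
+// test is applied down each column, with the rounding term dithered by
+// entries from the vp9_rv table.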
+void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch,
+ int rows, int cols, int flimit) {
+ int r, c, i;
+ const int16_t *rv3 = &vp9_rv[63 & rand()]; // NOLINT
+
+ for (c = 0; c < cols; c++) {
+ uint16_t *s = &dst[c];
+ int sumsq = 0;
+ int sum = 0;
+ uint16_t d[16];
+ const int16_t *rv2 = rv3 + ((c * 17) & 127);
+
+ for (i = -8; i <= 6; i++) {
+ sumsq += s[i * pitch] * s[i * pitch];
+ sum += s[i * pitch];
+ }
+
+ for (r = 0; r < rows + 8; r++) {
+ sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+ sum += s[7 * pitch] - s[-8 * pitch];
+ d[r & 15] = s[0];
+
+ if (sumsq * 15 - sum * sum < flimit) {
+ d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+ }
+
+ s[-8 * pitch] = d[(r - 8) & 15];
+ s += pitch;
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
(void) low_var_thresh;
(void) flag;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer),
+ CONVERT_TO_SHORTPTR(post->y_buffer),
+ source->y_stride, post->y_stride,
+ source->y_height, source->y_width,
+ ppl);
+
+ vp9_highbd_mbpost_proc_across_ip(CONVERT_TO_SHORTPTR(post->y_buffer),
+ post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+
+ vp9_highbd_mbpost_proc_down(CONVERT_TO_SHORTPTR(post->y_buffer),
+ post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+
+ vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->u_buffer),
+ CONVERT_TO_SHORTPTR(post->u_buffer),
+ source->uv_stride, post->uv_stride,
+ source->uv_height, source->uv_width,
+ ppl);
+ vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->v_buffer),
+ CONVERT_TO_SHORTPTR(post->v_buffer),
+ source->uv_stride, post->uv_stride,
+ source->uv_height, source->uv_width,
+ ppl);
+ } else {
+ vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
+ source->y_stride, post->y_stride,
+ source->y_height, source->y_width, ppl);
+
+ vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+
+ vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+
+ vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
+ source->uv_stride, post->uv_stride,
+ source->uv_height, source->uv_width, ppl);
+ vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
+ source->uv_stride, post->uv_stride,
+ source->uv_height, source->uv_width, ppl);
+ }
+#else
vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
source->y_stride, post->y_stride,
source->y_height, source->y_width, ppl);
vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
source->uv_stride, post->uv_stride,
source->uv_height, source->uv_width, ppl);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
for (i = 0; i < MAX_MB_PLANE; ++i)
+#if CONFIG_VP9_HIGHBITDEPTH
+ assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+ (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(srcs[i]),
+ CONVERT_TO_SHORTPTR(dsts[i]),
+ src_strides[i], dst_strides[i],
+ src_heights[i], src_widths[i], ppl);
+ } else {
+ vp9_post_proc_down_and_across(srcs[i], dsts[i],
+ src_strides[i], dst_strides[i],
+ src_heights[i], src_widths[i], ppl);
+ }
+#else
vp9_post_proc_down_and_across(srcs[i], dsts[i],
src_strides[i], dst_strides[i],
src_heights[i], src_widths[i], ppl);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
for (i = 0; i < MAX_MB_PLANE; ++i) {
const int src_stride = src_strides[i];
- const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
const int src_width = src_widths[i] - 4;
const int src_height = src_heights[i] - 4;
-
const int dst_stride = dst_strides[i];
- uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+ assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+ (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *const src = CONVERT_TO_SHORTPTR(srcs[i] + 2 * src_stride + 2);
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dsts[i] + 2 * dst_stride + 2);
+ vp9_highbd_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+ src_height, src_width, ppl);
+ } else {
+ const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
+ uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+
+ vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+ src_height, src_width, ppl);
+ }
+#else
+ const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
+ uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
src_height, src_width, ppl);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate post-processing buffer");
uint8_t *pre;
MV32 scaled_mv;
int xs, ys, subpel_x, subpel_y;
+ const int is_scaled = vp9_is_scaled(sf);
- if (vp9_is_scaled(sf)) {
+ if (is_scaled) {
pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
xs = sf->x_step_q4;
? average_split_mvs(pd, mi, ref, block)
: mi->mbmi.mv[ref].as_mv;
-
- // TODO(jkoleszar): This clamping is done in the incorrect place for the
- // scaling case. It needs to be done on the scaled MV, not the pre-scaling
- // MV. Note however that it performs the subsampling aware scaling so
- // that the result is always q4.
- // mv_precision precision is MV_PRECISION_Q4.
const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
pd->subsampling_x,
pd->subsampling_y);
subpel_x, subpel_y;
uint8_t *ref_frame, *buf_ptr;
const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
+ const int is_scaled = vp9_is_scaled(sf);
// Get reference frame pointer, width and height.
if (plane == 0) {
ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
}
- if (vp9_is_scaled(sf)) {
+ if (is_scaled) {
// Co-ordinate of containing block to pixel precision.
int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
subpel_x = scaled_mv.col & SUBPEL_MASK;
subpel_y = scaled_mv.row & SUBPEL_MASK;
- // Calculate the top left corner of the best matching block in the reference frame.
+ // Calculate the top left corner of the best matching block in the
+ // reference frame.
x0 += scaled_mv.col >> SUBPEL_BITS;
y0 += scaled_mv.row >> SUBPEL_BITS;
x0_16 += scaled_mv.col;
// Do border extension if there is motion or the
// width/height is not a multiple of 8 pixels.
- if (scaled_mv.col || scaled_mv.row ||
+ if (is_scaled || scaled_mv.col || scaled_mv.row ||
(frame_width & 0x7) || (frame_height & 0x7)) {
// Get reference block bottom right coordinate.
int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
int x_pad = 0, y_pad = 0;
- if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
+ if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
x0 -= VP9_INTERP_EXTEND - 1;
x1 += VP9_INTERP_EXTEND;
x_pad = 1;
}
- if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
+ if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
y0 -= VP9_INTERP_EXTEND - 1;
y1 += VP9_INTERP_EXTEND;
y_pad = 1;
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
struct macroblockd;
#
# Intra prediction
#
- add_proto qw/void vp9_high_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d207_predictor_4x4/;
- add_proto qw/void vp9_high_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d45_predictor_4x4/;
- add_proto qw/void vp9_high_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d63_predictor_4x4/;
- add_proto qw/void vp9_high_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_h_predictor_4x4/;
- add_proto qw/void vp9_high_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d117_predictor_4x4/;
- add_proto qw/void vp9_high_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d135_predictor_4x4/;
- add_proto qw/void vp9_high_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d153_predictor_4x4/;
- add_proto qw/void vp9_high_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_v_predictor_4x4 neon/, "$sse_x86inc";
- add_proto qw/void vp9_high_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_tm_predictor_4x4/, "$sse_x86inc";
- add_proto qw/void vp9_high_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_predictor_4x4/, "$sse_x86inc";
- add_proto qw/void vp9_high_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_top_predictor_4x4/;
- add_proto qw/void vp9_high_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_left_predictor_4x4/;
- add_proto qw/void vp9_high_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_128_predictor_4x4/;
- add_proto qw/void vp9_high_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d207_predictor_8x8/;
- add_proto qw/void vp9_high_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d45_predictor_8x8/;
- add_proto qw/void vp9_high_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d63_predictor_8x8/;
- add_proto qw/void vp9_high_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_h_predictor_8x8/;
- add_proto qw/void vp9_high_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d117_predictor_8x8/;
- add_proto qw/void vp9_high_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d135_predictor_8x8/;
- add_proto qw/void vp9_high_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d153_predictor_8x8/;
- add_proto qw/void vp9_high_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_v_predictor_8x8/, "$sse2_x86inc";
- add_proto qw/void vp9_high_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_tm_predictor_8x8/, "$sse2_x86inc";
- add_proto qw/void vp9_high_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_predictor_8x8/, "$sse2_x86inc";
- add_proto qw/void vp9_high_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_top_predictor_8x8/;
- add_proto qw/void vp9_high_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_left_predictor_8x8/;
- add_proto qw/void vp9_high_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_128_predictor_8x8/;
- add_proto qw/void vp9_high_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d207_predictor_16x16/;
- add_proto qw/void vp9_high_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d45_predictor_16x16/;
- add_proto qw/void vp9_high_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d63_predictor_16x16/;
- add_proto qw/void vp9_high_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_h_predictor_16x16/;
- add_proto qw/void vp9_high_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d117_predictor_16x16/;
- add_proto qw/void vp9_high_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d135_predictor_16x16/;
- add_proto qw/void vp9_high_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d153_predictor_16x16/;
- add_proto qw/void vp9_high_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_v_predictor_16x16 neon/, "$sse2_x86inc";
- add_proto qw/void vp9_high_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_tm_predictor_16x16/, "$sse2_x86_64";
- add_proto qw/void vp9_high_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_predictor_16x16/, "$sse2_x86inc";
- add_proto qw/void vp9_high_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_top_predictor_16x16/;
- add_proto qw/void vp9_high_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_left_predictor_16x16/;
- add_proto qw/void vp9_high_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_128_predictor_16x16/;
- add_proto qw/void vp9_high_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d207_predictor_32x32/;
- add_proto qw/void vp9_high_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d45_predictor_32x32/;
- add_proto qw/void vp9_high_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d63_predictor_32x32/;
- add_proto qw/void vp9_high_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_h_predictor_32x32/;
- add_proto qw/void vp9_high_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d117_predictor_32x32/;
- add_proto qw/void vp9_high_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d135_predictor_32x32/;
- add_proto qw/void vp9_high_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_d153_predictor_32x32/;
- add_proto qw/void vp9_high_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_v_predictor_32x32/, "$sse2_x86inc";
- add_proto qw/void vp9_high_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_tm_predictor_32x32/, "$sse2_x86_64";
- add_proto qw/void vp9_high_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_predictor_32x32/, "$sse2_x86_64";
- add_proto qw/void vp9_high_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_top_predictor_32x32/;
- add_proto qw/void vp9_high_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_left_predictor_32x32/;
- add_proto qw/void vp9_high_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+ add_proto qw/void vp9_high_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_high_dc_128_predictor_32x32/;
#
add_proto qw/void vp9_high_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vp9_high_convolve8_avg_vert/, "$sse2_x86_64";
+ #
+ # Loopfilter
+ #
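+ # The high bit depth prototypes mirror the 8-bit versions but operate on
+ # uint16_t pixels and take a trailing bit depth (bd) argument.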
+ add_proto qw/void vp9_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/vp9_highbd_lpf_vertical_16 sse2/;
+
+ add_proto qw/void vp9_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/vp9_highbd_lpf_vertical_16_dual sse2/;
+
+ add_proto qw/void vp9_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+ specialize qw/vp9_highbd_lpf_vertical_8 sse2/;
+
+ add_proto qw/void vp9_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/vp9_highbd_lpf_vertical_8_dual sse2/;
+
+ add_proto qw/void vp9_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+ specialize qw/vp9_highbd_lpf_vertical_4 sse2/;
+
+ add_proto qw/void vp9_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/vp9_highbd_lpf_vertical_4_dual sse2/;
+
+ add_proto qw/void vp9_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+ specialize qw/vp9_highbd_lpf_horizontal_16 sse2/;
+
+ add_proto qw/void vp9_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+ specialize qw/vp9_highbd_lpf_horizontal_8 sse2/;
+
+ add_proto qw/void vp9_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/vp9_highbd_lpf_horizontal_8_dual sse2/;
+
+ add_proto qw/void vp9_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+ specialize qw/vp9_highbd_lpf_horizontal_4 sse2/;
+
+ add_proto qw/void vp9_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/vp9_highbd_lpf_horizontal_4_dual sse2/;
+
+ #
+ # post proc
+ #
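+ # Only C implementations exist so far, hence the empty specialize lists.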
+ if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+ add_proto qw/void vp9_highbd_mbpost_proc_down/, "uint16_t *dst, int pitch, int rows, int cols, int flimit";
+ specialize qw/vp9_highbd_mbpost_proc_down/;
+
+ add_proto qw/void vp9_highbd_mbpost_proc_across_ip/, "uint16_t *src, int pitch, int rows, int cols, int flimit";
+ specialize qw/vp9_highbd_mbpost_proc_across_ip/;
+
+ add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+ specialize qw/vp9_highbd_post_proc_down_and_across/;
+
+ add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+ specialize qw/vp9_highbd_plane_add_noise/;
+ }
+
#
# dct
#
add_proto qw/void vp9_high_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_high_fdct32x32_rd/;
- add_proto qw/void vp9_high_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
- specialize qw/vp9_high_temporal_filter_apply/;
+ add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+ specialize qw/vp9_highbd_temporal_filter_apply/;
}
# End vp9_high encoder functions
--- /dev/null
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vpx_ports/emmintrin_compat.h"
+
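+// Clamp the 16-bit lanes of |value| to the bd-bit analogue of the signed
+// char range: [-(0x80 << (bd - 8)), (1 << bd) - 1 - (0x80 << (bd - 8))].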
+static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
+ __m128i ubounded;
+ __m128i lbounded;
+ __m128i retval;
+
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+ const __m128i max = _mm_subs_epi16(
+ _mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
+ const __m128i min = _mm_subs_epi16(zero, t80);
+ ubounded = _mm_cmpgt_epi16(value, max);
+ lbounded = _mm_cmplt_epi16(value, min);
+ retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+ ubounded = _mm_and_si128(ubounded, max);
+ lbounded = _mm_and_si128(lbounded, min);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_or_si128(retval, lbounded);
+ return retval;
+}
+
+// TODO(debargha, peter): Break up large functions into smaller ones
+// in this file.
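+// Filter a horizontal edge 8 pixels wide: build the filter, flat and flat2
+// masks for all lanes at once, evaluate the filter4, filter8 and filter16
+// outputs, then blend them per lane according to those masks.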
+static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
+ int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh,
+ int bd) {
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(
+ _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
+ const __m128i limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), bd - 8);
+ const __m128i thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(
+ _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+ __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
+ __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
+ __m128i ps1, qs1, ps0, qs0;
+ __m128i abs_p0q0, abs_p1q1, ffff, work;
+ __m128i filt, work_a, filter1, filter2;
+ __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
+ __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
+ __m128i flat2_q0, flat2_p0;
+ __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3;
+ __m128i t4, t3, t80, t1;
+ __m128i eight, four;
+
+ q4 = _mm_load_si128((__m128i *)(s + 4 * p));
+ p4 = _mm_load_si128((__m128i *)(s - 5 * p));
+ q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+ p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+ q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+ p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+ q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+ p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+ q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+ p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+
+ // high_filter_mask
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+ abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+ ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+
+ // high_hev_mask (in C code this is actually called from high_filter4)
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
+ _mm_subs_epu16(p0, p1)),
+ _mm_or_si128(_mm_subs_epu16(q1, q0),
+ _mm_subs_epu16(q0, q1)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+ _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q1),
+ _mm_subs_epu16(q1, q2)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+ _mm_subs_epu16(p2, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2),
+ _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+
+ mask = _mm_subs_epu16(mask, limit);
+ mask = _mm_cmpeq_epi16(mask, zero); // return ~mask
+
+ // lp filter
+ // high_filter4
+ t4 = _mm_set1_epi16(4);
+ t3 = _mm_set1_epi16(3);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+ t1 = _mm_set1_epi16(0x1);
+
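+ // Re-center the pixels around zero (subtract t80) so that the signed
+ // filter4 arithmetic of the C reference carries over to 16-bit lanes.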
+ ps1 = _mm_subs_epi16(p1, t80);
+ qs1 = _mm_subs_epi16(q1, t80);
+ ps0 = _mm_subs_epi16(p0, t80);
+ qs0 = _mm_subs_epi16(q0, t80);
+
+ filt = _mm_and_si128(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+ filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+ // Filter1 >> 3 and Filter2 >> 3
+ filter1 = _mm_srai_epi16(filter1, 0x3);
+ filter2 = _mm_srai_epi16(filter2, 0x3);
+
+ qs0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
+ t80);
+ ps0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
+ t80);
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(hev, filt);
+
+ qs1 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+ t80);
+ ps1 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+ t80);
+ // end high_filter4
+ // loopfilter done
+
+ // high_flat_mask4
+ flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+ _mm_subs_epu16(p0, p2)),
+ _mm_or_si128(_mm_subs_epu16(p3, p0),
+ _mm_subs_epu16(p0, p3)));
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
+ _mm_subs_epu16(q0, q2)),
+ _mm_or_si128(_mm_subs_epu16(q3, q0),
+ _mm_subs_epu16(q0, q3)));
+ flat = _mm_max_epi16(work, flat);
+ work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ flat = _mm_max_epi16(work, flat);
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+ flat = _mm_cmpeq_epi16(flat, zero);
+ // end flat_mask4
+
+ // flat & mask = flat && mask (as used in filter8)
+ // (because, in both vars, each 16-bit lane is either all 1s or all 0s)
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_load_si128((__m128i *)(s - 6 * p));
+ q5 = _mm_load_si128((__m128i *)(s + 5 * p));
+ p6 = _mm_load_si128((__m128i *)(s - 7 * p));
+ q6 = _mm_load_si128((__m128i *)(s + 6 * p));
+ p7 = _mm_load_si128((__m128i *)(s - 8 * p));
+ q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+
+ // high_flat_mask5 (the arguments passed in are p0, q0, p4-p7 and q4-q7,
+ // but they are referred to as p0-p4 and q0-q4 inside the function)
+ flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
+ _mm_subs_epu16(p0, p4)),
+ _mm_or_si128(_mm_subs_epu16(q4, q0),
+ _mm_subs_epu16(q0, q4)));
+
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
+ _mm_subs_epu16(p0, p5)),
+ _mm_or_si128(_mm_subs_epu16(q5, q0),
+ _mm_subs_epu16(q0, q5)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
+ _mm_subs_epu16(p0, p6)),
+ _mm_or_si128(_mm_subs_epu16(q6, q0),
+ _mm_subs_epu16(q0, q6)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
+ _mm_subs_epu16(p0, p7)),
+ _mm_or_si128(_mm_subs_epu16(q7, q0),
+ _mm_subs_epu16(q0, q7)));
+ flat2 = _mm_max_epi16(work, flat2);
+
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, bd - 8));
+ flat2 = _mm_cmpeq_epi16(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ // end high_flat_mask5
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ eight = _mm_set1_epi16(8);
+ four = _mm_set1_epi16(4);
+
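+ // The filter8 and filter16 outputs are built with running sums: after the
+ // initial totals, each tap is produced by subtracting the sample that
+ // leaves the window and adding the one that enters it.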
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
+ _mm_add_epi16(p4, p3));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
+ _mm_add_epi16(q4, q3));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+ pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(four,
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0));
+ flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(p7, p0)), 4);
+ flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(q7, q0)), 4);
+ flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(p3, p0)), 3);
+ flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(q3, q0)), 3);
+
+ sum_p7 = _mm_add_epi16(p7, p7);
+ sum_q7 = _mm_add_epi16(q7, q7);
+ sum_p3 = _mm_add_epi16(p3, p3);
+ sum_q3 = _mm_add_epi16(q3, q3);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
+ flat2_p1 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
+ flat2_q1 = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
+ flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p1)), 3);
+ flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q1)), 3);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ sum_p3 = _mm_add_epi16(sum_p3, p3);
+ sum_q3 = _mm_add_epi16(sum_q3, q3);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
+ flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p2)), 4);
+ flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q2)), 4);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
+ flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p2)), 3);
+ flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q2)), 3);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
+ flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p3)), 4);
+ flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q3)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
+ flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p4)), 4);
+ flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q4)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
+ flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p5)), 4);
+ flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q5)), 4);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7);
+ sum_q7 = _mm_add_epi16(sum_q7, q7);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
+ flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p6)), 4);
+ flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q6)), 4);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ // high_filter8
+ p2 = _mm_andnot_si128(flat, p2);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ // when (flat && mask)
+ p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values
+
+ ps1 = _mm_andnot_si128(flat, ps1);
+ // p1 takes the value assigned to it in filter4 if !(flat && mask)
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ // when (flat && mask)
+ p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values
+ qs1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values
+
+ ps0 = _mm_andnot_si128(flat, ps0);
+ // p0 takes the value assigned to it in filter4 if !(flat && mask)
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ // when (flat && mask)
+ p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values
+ qs0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values
+ // end high_filter8
+
+ // high_filter16
+ p6 = _mm_andnot_si128(flat2, p6);
+ // p6 remains unchanged if !(flat2 && flat && mask)
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ // get values for when (flat2 && flat && mask)
+ p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values
+ q6 = _mm_andnot_si128(flat2, q6);
+ // q6 remains unchanged if !(flat2 && flat && mask)
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ // get values for when (flat2 && flat && mask)
+ q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values
+ _mm_store_si128((__m128i *)(s - 7 * p), p6);
+ _mm_store_si128((__m128i *)(s + 6 * p), q6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ // p5 remains unchanged if !(flat2 && flat && mask)
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ // get values for when (flat2 && flat && mask)
+ p5 = _mm_or_si128(p5, flat2_p5);
+ // full list of p5 values
+ q5 = _mm_andnot_si128(flat2, q5);
+ // q5 remains unchanged if !(flat2 && flat && mask)
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ // get values for when (flat2 && flat && mask)
+ q5 = _mm_or_si128(q5, flat2_q5);
+ // full list of q5 values
+ _mm_store_si128((__m128i *)(s - 6 * p), p5);
+ _mm_store_si128((__m128i *)(s + 5 * p), q5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ // p4 remains unchanged if !(flat2 && flat && mask)
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ // get values for when (flat2 && flat && mask)
+ p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values
+ q4 = _mm_andnot_si128(flat2, q4);
+ // q4 remains unchanged if !(flat2 && flat && mask)
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ // get values for when (flat2 && flat && mask)
+ q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values
+ _mm_store_si128((__m128i *)(s - 5 * p), p4);
+ _mm_store_si128((__m128i *)(s + 4 * p), q4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ // p3 takes value from high_filter8 if !(flat2 && flat && mask)
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ // get values for when (flat2 && flat && mask)
+ p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values
+ q3 = _mm_andnot_si128(flat2, q3);
+ // q3 takes value from high_filter8 if !(flat2 && flat && mask)
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ // get values for when (flat2 && flat && mask)
+ q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values
+ _mm_store_si128((__m128i *)(s - 4 * p), p3);
+ _mm_store_si128((__m128i *)(s + 3 * p), q3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ // p2 takes value from high_filter8 if !(flat2 && flat && mask)
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ // get values for when (flat2 && flat && mask)
+ p2 = _mm_or_si128(p2, flat2_p2);
+ // full list of p2 values
+ q2 = _mm_andnot_si128(flat2, q2);
+ // q2 takes value from high_filter8 if !(flat2 && flat && mask)
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ // get values for when (flat2 && flat && mask)
+ q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values
+ _mm_store_si128((__m128i *)(s - 3 * p), p2);
+ _mm_store_si128((__m128i *)(s + 2 * p), q2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ // p1 takes value from high_filter8 if !(flat2 && flat && mask)
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ // get values for when (flat2 && flat && mask)
+ p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values
+ q1 = _mm_andnot_si128(flat2, q1);
+ // q1 takes value from high_filter8 if !(flat2 && flat && mask)
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ // get values for when (flat2 && flat && mask)
+ q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values
+ _mm_store_si128((__m128i *)(s - 2 * p), p1);
+ _mm_store_si128((__m128i *)(s + 1 * p), q1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ // p0 takes value from high_filter8 if !(flat2 && flat && mask)
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ // get values for when (flat2 && flat && mask)
+ p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values
+ q0 = _mm_andnot_si128(flat2, q0);
+ // q0 takes value from high_filter8 if !(flat2 && flat && mask)
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ // get values for when (flat2 && flat && mask)
+ q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values
+ _mm_store_si128((__m128i *)(s - 1 * p), p0);
+ _mm_store_si128((__m128i *)(s + 0 * p), q0);
+}
+
+static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s,
+ int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh,
+ int bd) {
+ highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+ highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh,
+ bd);
+}
+
+// TODO(yunqingwang): remove count and call these two functions (8 or 16) directly.
+void vp9_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh,
+ int count, int bd) {
+ if (count == 1)
+ highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+ else
+ highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
+}
+
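+// filter8 path: the six flat-filtered outputs are written to aligned
+// scratch rows first and blended with the filter4 results under the flat
+// mask before the final stores.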
+void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh,
+ int count, int bd) {
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op2, 16);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op1, 16);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op0, 16);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq2, 16);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq1, 16);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq0, 16);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero),
+ bd - 8);
+ const __m128i limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero),
+ bd - 8);
+ const __m128i thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero),
+ bd - 8);
+ __m128i mask, hev, flat;
+ __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+ __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+ __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+ __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+ __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+ __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+ __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_shft;
+
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ const __m128i ps1 = _mm_subs_epi16(p1, t80);
+ const __m128i ps0 = _mm_subs_epi16(p0, t80);
+ const __m128i qs0 = _mm_subs_epi16(q0, t80);
+ const __m128i qs1 = _mm_subs_epi16(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ (void)count;
+
+ // filter_mask and hev_mask
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+ _mm_subs_epu16(p0, p1));
+ abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+ _mm_subs_epu16(q0, q1));
+
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+ _mm_subs_epu16(q0, p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+ _mm_subs_epu16(q1, p1));
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+ mask = _mm_max_epi16(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ mask = _mm_max_epi16(abs_q1q0, mask);
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+ _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q1),
+ _mm_subs_epu16(q1, q2)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+ _mm_subs_epu16(p2, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2),
+ _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_subs_epu16(mask, limit);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // flat_mask4
+ flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+ _mm_subs_epu16(p0, p2)),
+ _mm_or_si128(_mm_subs_epu16(q2, q0),
+ _mm_subs_epu16(q0, q2)));
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
+ _mm_subs_epu16(p0, p3)),
+ _mm_or_si128(_mm_subs_epu16(q3, q0),
+ _mm_subs_epu16(q0, q3)));
+ flat = _mm_max_epi16(work, flat);
+ flat = _mm_max_epi16(abs_p1p0, flat);
+ flat = _mm_max_epi16(abs_q1q0, flat);
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+ // four is added before the shift to implement the rounding of ROUND_POWER_OF_TWO.
+
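+ // The six filter8 outputs are built incrementally from workp_a/workp_b:
+ // each step drops one sample from the running sum and adds the next one.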
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
+
+ // lp filter
+ filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+ filt = _mm_and_si128(filt, hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
+ filt = signed_char_clamp_bd_sse2(filt, bd);
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi16(filt, t4);
+ filter2 = _mm_adds_epi16(filt, t3);
+
+ // Filter1 >> 3
+ filter1 = signed_char_clamp_bd_sse2(filter1, bd);
+ filter1 = _mm_srai_epi16(filter1, 3);
+
+ // Filter2 >> 3
+ filter2 = signed_char_clamp_bd_sse2(filter2, bd);
+ filter2 = _mm_srai_epi16(filter2, 3);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ q0 = _mm_load_si128((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ q1 = _mm_load_si128((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q2 = _mm_load_si128((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ p0 = _mm_load_si128((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
+ work_a = _mm_adds_epi16(work_a, t80);
+ p1 = _mm_load_si128((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p2 = _mm_load_si128((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_store_si128((__m128i *)(s - 3 * p), p2);
+ _mm_store_si128((__m128i *)(s - 2 * p), p1);
+ _mm_store_si128((__m128i *)(s - 1 * p), p0);
+ _mm_store_si128((__m128i *)(s + 0 * p), q0);
+ _mm_store_si128((__m128i *)(s + 1 * p), q1);
+ _mm_store_si128((__m128i *)(s + 2 * p), q2);
+}
+
+void vp9_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1,
+ int bd) {
+ vp9_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+ vp9_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
+ 1, bd);
+}
+
+void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh,
+ int count, int bd) {
+ const __m128i zero = _mm_set1_epi16(0);
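+  // The 8-bit blimit/limit/thresh values are widened to 16 bits and shifted
+  // up to the operating bit depth.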
+ const __m128i blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(
+ _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
+ const __m128i limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(
+ _mm_load_si128((const __m128i *)_limit), zero), bd - 8);
+ const __m128i thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(
+ _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+ __m128i mask, hev, flat;
+ __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+ _mm_subs_epu16(p0, p1));
+ const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+ _mm_subs_epu16(q0, q1));
+ const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+ _mm_subs_epu16(q0, p0));
+ __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+ _mm_subs_epu16(q1, p1));
+ __m128i work;
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+ const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
+ const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
+ const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
+ // equivalent to shifting 0x1f left by bitdepth - 8
+ // and setting new bits to 1
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
+ // equivalent to shifting 0x7f left by bitdepth - 8
+ // and setting new bits to 1
+ const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+ t80);
+ const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+ t80);
+ const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+ t80);
+ const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+ t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
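+  // count is ignored: this version always filters one 8-pixel run, and the
+  // _dual variant below simply calls it twice.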
+ (void)count;
+
+ // filter_mask and hev_mask
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu16(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+ mask = _mm_max_epi16(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+ _mm_subs_epu16(p1, p2)),
+ _mm_or_si128(_mm_subs_epu16(p3, p2),
+ _mm_subs_epu16(p2, p3)));
+ mask = _mm_max_epi16(work, mask);
+ work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
+ _mm_subs_epu16(q1, q2)),
+ _mm_or_si128(_mm_subs_epu16(q3, q2),
+ _mm_subs_epu16(q2, q3)));
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_subs_epu16(mask, limit);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // filter4
+ filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+ filt = _mm_and_si128(filt, hev);
+ work_a = _mm_subs_epi16(qs0, ps0);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = _mm_adds_epi16(filt, work_a);
+ filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+ filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0
+ filter1 = _mm_and_si128(filter1, t1f); // clamp the range
+ filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi16(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, tffe0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filter1, t1);
+ work_a = _mm_cmpgt_epi16(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, tff80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ q0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+ q1 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
+ p0 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+ p1 = _mm_adds_epi16(
+ signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1,
+ int bd) {
+ vp9_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+ vp9_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
+ bd);
+}
+
+static INLINE void highbd_transpose(uint16_t *src[], int in_p,
+ uint16_t *dst[], int out_p,
+ int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ uint16_t *in = src[idx8x8];
+ uint16_t *out = dst[idx8x8];
+
+ p0 = _mm_loadu_si128((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07
+ p1 = _mm_loadu_si128((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17
+ p2 = _mm_loadu_si128((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
+ p3 = _mm_loadu_si128((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
+ p4 = _mm_loadu_si128((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
+ p5 = _mm_loadu_si128((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
+ p6 = _mm_loadu_si128((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
+ p7 = _mm_loadu_si128((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
+ // 00 10 01 11 02 12 03 13
+ x0 = _mm_unpacklo_epi16(p0, p1);
+ // 20 30 21 31 22 32 23 33
+ x1 = _mm_unpacklo_epi16(p2, p3);
+ // 40 50 41 51 42 52 43 53
+ x2 = _mm_unpacklo_epi16(p4, p5);
+ // 60 70 61 71 62 72 63 73
+ x3 = _mm_unpacklo_epi16(p6, p7);
+ // 00 10 20 30 01 11 21 31
+ x4 = _mm_unpacklo_epi32(x0, x1);
+ // 40 50 60 70 41 51 61 71
+ x5 = _mm_unpacklo_epi32(x2, x3);
+ // 00 10 20 30 40 50 60 70
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 01 11 21 31 41 51 61 71
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 0*out_p), x6);
+ // 00 10 20 30 40 50 60 70
+ _mm_storeu_si128((__m128i *)(out + 1*out_p), x7);
+ // 01 11 21 31 41 51 61 71
+
+ // 02 12 22 32 03 13 23 33
+ x4 = _mm_unpackhi_epi32(x0, x1);
+ // 42 52 62 72 43 53 63 73
+ x5 = _mm_unpackhi_epi32(x2, x3);
+ // 02 12 22 32 42 52 62 72
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 2*out_p), x6);
+ // 02 12 22 32 42 52 62 72
+ _mm_storeu_si128((__m128i *)(out + 3*out_p), x7);
+ // 03 13 23 33 43 53 63 73
+
+ // 04 14 05 15 06 16 07 17
+ x0 = _mm_unpackhi_epi16(p0, p1);
+ // 24 34 25 35 26 36 27 37
+ x1 = _mm_unpackhi_epi16(p2, p3);
+ // 44 54 45 55 46 56 47 57
+ x2 = _mm_unpackhi_epi16(p4, p5);
+ // 64 74 65 75 66 76 67 77
+ x3 = _mm_unpackhi_epi16(p6, p7);
+ // 04 14 24 34 05 15 25 35
+ x4 = _mm_unpacklo_epi32(x0, x1);
+ // 44 54 64 74 45 55 65 75
+ x5 = _mm_unpacklo_epi32(x2, x3);
+ // 04 14 24 34 44 54 64 74
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 05 15 25 35 45 55 65 75
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 4*out_p), x6);
+ // 04 14 24 34 44 54 64 74
+ _mm_storeu_si128((__m128i *)(out + 5*out_p), x7);
+ // 05 15 25 35 45 55 65 75
+
+ // 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi32(x0, x1);
+ // 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi32(x2, x3);
+ // 06 16 26 36 46 56 66 76
+ x6 = _mm_unpacklo_epi64(x4, x5);
+ // 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi64(x4, x5);
+
+ _mm_storeu_si128((__m128i *)(out + 6*out_p), x6);
+ // 06 16 26 36 46 56 66 76
+ _mm_storeu_si128((__m128i *)(out + 7*out_p), x7);
+ // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
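+// Transposes the 8x8 blocks at in0 and in1 into a single 8-row by 16-column
+// block at out (the second transpose lands at out + 8).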
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1,
+ int in_p, uint16_t *out, int out_p) {
+ uint16_t *src0[1];
+ uint16_t *src1[1];
+ uint16_t *dest0[1];
+ uint16_t *dest1[1];
+ src0[0] = in0;
+ src1[0] = in1;
+ dest0[0] = out;
+ dest1[0] = out + 8;
+ highbd_transpose(src0, in_p, dest0, out_p, 1);
+ highbd_transpose(src1, in_p, dest1, out_p, 1);
+}
+
+void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count, int bd) {
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 8);
+ uint16_t *src[1];
+ uint16_t *dst[1];
+ (void)count;
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ highbd_transpose(src, p, dst, 8, 1);
+
+ // Loop filtering
+ vp9_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+ bd);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1,
+ int bd) {
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 8);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ // Transpose 8x16
+ highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+ // Loop filtering
+ vp9_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
+
+ // Transpose back
+ highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count, int bd) {
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 8);
+ uint16_t *src[1];
+ uint16_t *dst[1];
+ (void)count;
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ highbd_transpose(src, p, dst, 8, 1);
+
+ // Loop filtering
+ vp9_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+ bd);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1,
+ int bd) {
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 8);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ // Transpose 8x16
+ highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+ // Loop filtering
+ vp9_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
+
+ // Transpose back
+ highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int bd) {
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 16);
+ uint16_t *src[2];
+ uint16_t *dst[2];
+
+ src[0] = s - 8;
+ src[1] = s;
+ dst[0] = t_dst;
+ dst[1] = t_dst + 8 * 8;
+
+ // Transpose 16x8
+ highbd_transpose(src, p, dst, 8, 2);
+
+ // Loop filtering
+ highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit,
+ thresh, bd);
+ src[0] = t_dst;
+ src[1] = t_dst + 8 * 8;
+ dst[0] = s - 8;
+ dst[1] = s;
+
+ // Transpose back
+ highbd_transpose(src, 8, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
+ int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int bd) {
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 256);
+
+ // Transpose 16x16
+ highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+ highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+ thresh, bd);
+
+ // Transpose back
+ highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+ highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+}
if (eob > 0) {
TX_TYPE tx_type = DCT_DCT;
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
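+    // Same transform selection as the 8-bit path below, dispatched to the
+    // bd-aware inverse transforms that write 16-bit pixels.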
+ if (xd->lossless) {
+ tx_type = DCT_DCT;
+ vp9_high_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+ } else {
+ const PLANE_TYPE plane_type = pd->plane_type;
+ switch (tx_size) {
+ case TX_4X4:
+ tx_type = get_tx_type_4x4(plane_type, xd, block);
+ vp9_high_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_8X8:
+ tx_type = get_tx_type(plane_type, xd);
+ vp9_high_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_16X16:
+ tx_type = get_tx_type(plane_type, xd);
+ vp9_high_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_32X32:
+ tx_type = DCT_DCT;
+ vp9_high_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
+ }
+ }
+ } else {
+ if (xd->lossless) {
+ tx_type = DCT_DCT;
+ vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+ } else {
+ const PLANE_TYPE plane_type = pd->plane_type;
+ switch (tx_size) {
+ case TX_4X4:
+ tx_type = get_tx_type_4x4(plane_type, xd, block);
+ vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_8X8:
+ tx_type = get_tx_type(plane_type, xd);
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_16X16:
+ tx_type = get_tx_type(plane_type, xd);
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_32X32:
+ tx_type = DCT_DCT;
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
+ return;
+ }
+ }
+ }
+#else
if (xd->lossless) {
tx_type = DCT_DCT;
vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
break;
default:
assert(0 && "Invalid transform size");
+ return;
}
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
if (eob == 1) {
vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
update |= read_delta_q(rb, &cm->y_dc_delta_q);
update |= read_delta_q(rb, &cm->uv_dc_delta_q);
update |= read_delta_q(rb, &cm->uv_ac_delta_q);
- if (update)
+ if (update || cm->bit_depth != cm->dequant_bit_depth) {
vp9_init_dequantizer(cm);
+ cm->dequant_bit_depth = cm->bit_depth;
+ }
xd->lossless = cm->base_qindex == 0 &&
cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd->bd = (int)cm->bit_depth;
+#endif
}
static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) {
static void read_bitdepth_colorspace_sampling(
VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
- if (cm->profile >= PROFILE_2)
+ if (cm->profile >= PROFILE_2) {
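+ // Profiles 2 and 3 signal 10- versus 12-bit with one extra bit; profiles
+ // 0 and 1 are always 8-bit.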
cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth = 1;
+#endif
+ } else {
+ cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth = 0;
+#endif
+ }
cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
if (cm->color_space != SRGB) {
vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range
// case (normative).
cm->color_space = BT_601;
cm->subsampling_y = cm->subsampling_x = 1;
+ cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth = 0;
+#endif
}
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
}
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+ get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
+#endif
if (pbi->need_resync) {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
cm->current_video_frame = 0;
pbi->ready_for_new_data = 1;
cm->bit_depth = VPX_BITS_8;
+ cm->dequant_bit_depth = VPX_BITS_8;
// vp9_init_dequantizer() is first called here. Add check in
// frame_init_dequantizer() to avoid unnecessary calling of
#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};
+#if CONFIG_VP9_HIGHBITDEPTH
+DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = {0};
+#endif
unsigned int vp9_vaq_segment_id(int energy) {
ENERGY_IN_BOUNDS(energy);
const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
int avg;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, &sse,
+ &avg);
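+    // Normalize to an 8-bit scale: sse is a sum of squares, so it drops by
+    // 2 * (bd - 8) bits, while avg is linear in the pixel values.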
+ sse >>= 2 * (xd->bd - 8);
+ avg >>= (xd->bd - 8);
+ } else {
+ variance(x->plane[0].src.buf, x->plane[0].src.stride,
+ vp9_64_zeros, 0, bw, bh, &sse, &avg);
+ }
+#else
variance(x->plane[0].src.buf, x->plane[0].src.stride,
vp9_64_zeros, 0, bw, bh, &sse, &avg);
+#endif // CONFIG_VP9_HIGHBITDEPTH
var = sse - (((int64_t)avg * avg) / (bw * bh));
return (256 * var) / (bw * bh);
} else {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+ x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros),
+ 0, &sse);
+ } else {
+ var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+ x->plane[0].src.stride,
+ vp9_64_zeros, 0, &sse);
+ }
+#else
var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
x->plane[0].src.stride,
vp9_64_zeros, 0, &sse);
+#endif // CONFIG_VP9_HIGHBITDEPTH
return (256 * var) >> num_pels_log2_lookup[bs];
}
}
}
static void pack_mb_tokens(vp9_writer *w,
- TOKENEXTRA **tp, const TOKENEXTRA *const stop) {
+ TOKENEXTRA **tp, const TOKENEXTRA *const stop,
+ vpx_bit_depth_t bit_depth) {
TOKENEXTRA *p = *tp;
while (p < stop && p->token != EOSB_TOKEN) {
const int t = p->token;
const struct vp9_token *const a = &vp9_coef_encodings[t];
- const vp9_extra_bit *const b = &vp9_extra_bits[t];
int i = 0;
int v = a->value;
int n = a->len;
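+    // The extra-bit tables depend on bit depth because the transform
+    // coefficient range widens at 10 and 12 bits.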
+#if CONFIG_VP9_HIGHBITDEPTH
+ const vp9_extra_bit *b;
+ if (bit_depth == VPX_BITS_12)
+ b = &vp9_extra_bits_high12[t];
+ else if (bit_depth == VPX_BITS_10)
+ b = &vp9_extra_bits_high10[t];
+ else
+ b = &vp9_extra_bits[t];
+#else
+ const vp9_extra_bit *const b = &vp9_extra_bits[t];
+ (void) bit_depth;
+#endif // CONFIG_VP9_HIGHBITDEPTH
/* skip one or two nodes */
if (p->skip_eob_node) {
}
assert(*tok < tok_end);
- pack_mb_tokens(w, tok, tok_end);
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth);
}
static void write_partition(const VP9_COMMON *const cm,
// Otherwise, we try to dampen the filter if the delta is not too high.
delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
- >> 8) + 1;
+ >> num_pels_log2_lookup[bs]) + 1;
if (delta >= delta_thresh(bs, increase_denoising)) {
return COPY_BLOCK;
// Eventually this should be replaced by custom no-reference routines,
// which will be faster.
static const uint8_t VP9_VAR_OFFS[64] = {
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
};
+#if CONFIG_VP9_HIGHBITDEPTH
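+// Flat mid-gray reference blocks at each bit depth: 128 is scaled by 4 for
+// 10-bit input and by 16 for 12-bit input.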
+static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = {
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = {
+ 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+ 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+ 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+ 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+ 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+ 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+ 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+ 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {
+ 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+ 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+ 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+ 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+ 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+ 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+ 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+ 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16
+};
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
const struct buf_2d *ref,
BLOCK_SIZE bs) {
return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static unsigned int high_get_sby_perpixel_variance(
+ VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) {
+ unsigned int var, sse;
+ switch (bd) {
+ case 10:
+ var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10),
+ 0, &sse);
+ break;
+ case 12:
+ var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12),
+ 0, &sse);
+ break;
+ case 8:
+ default:
+ var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8),
+ 0, &sse);
+ break;
+ }
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
const struct buf_2d *ref,
int mi_row, int mi_col,
} else {
d = VP9_VAR_OFFS;
dp = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (xd->bd) {
+ case 10:
+ d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10);
+ break;
+ case 12:
+ d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12);
+ break;
+ case 8:
+ default:
+ d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8);
+ break;
+ }
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
// Fill in the entire tree of 8x8 variances for splits.
// Set to zero to make sure we do not use the previous encoded frame stats
mbmi->skip = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->source_variance =
+ high_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ x->source_variance =
+ get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+#else
x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+#endif // CONFIG_VP9_HIGHBITDEPTH
// Save rdmult before it might be changed, so it can be restored later.
orig_rdmult = x->rdmult;
for (i = 0; i < cm->mb_rows; i++) {
for (j = 0; j < cm->mb_cols; j++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case VPX_BITS_8:
+ vp9_high_get16x16var(src, src_stride, last_src, last_stride,
+ &var16->sse, &var16->sum);
+ break;
+ case VPX_BITS_10:
+ vp9_high_10_get16x16var(src, src_stride, last_src, last_stride,
+ &var16->sse, &var16->sum);
+ break;
+ case VPX_BITS_12:
+ vp9_high_12_get16x16var(src, src_stride, last_src, last_stride,
+ &var16->sse, &var16->sum);
+ break;
+ default:
+ assert(0 && "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10"
+ " or VPX_BITS_12");
+ return -1;
+ }
+ } else {
+ vp9_get16x16var(src, src_stride, last_src, last_stride,
+ &var16->sse, &var16->sum);
+ }
+#else
vp9_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
-
+#endif // CONFIG_VP9_HIGHBITDEPTH
var16->var = var16->sse -
(((uint32_t)var16->sum * var16->sum) >> 8);
cm->tx_mode = select_tx_mode(cpi);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    x->fwd_txm4x4 = xd->lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4;
+  else
+    x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+  x->high_itxm_add = xd->lossless ? vp9_high_iwht4x4_add
+                                  : vp9_high_idct4x4_add;
+#else
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+#endif // CONFIG_VP9_HIGHBITDEPTH
x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
if (xd->lossless) {
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
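+// High bit depth counterpart of vp9_subtract_block: the same arithmetic,
+// but the source and prediction are 16-bit buffers reached through
+// CONVERT_TO_SHORTPTR.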
+void vp9_high_subtract_block_c(int rows, int cols,
+ int16_t *diff, ptrdiff_t diff_stride,
+ const uint8_t *src8, ptrdiff_t src_stride,
+ const uint8_t *pred8, ptrdiff_t pred_stride,
+ int bd) {
+ int r, c;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ (void) bd;
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++) {
+ diff[c] = src[c] - pred[c];
+ }
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_high_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, x->e_mbd.bd);
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
pd->dst.buf, pd->dst.stride);
}
int64_t rd_cost0, rd_cost1;
int rate0, rate1, error0, error1, t0, t1;
int best, band, pt, i, final_eob;
+ const TOKENVALUE *dct_value_tokens;
+ const int16_t *dct_value_cost;
assert((!type && !plane) || (type && plane));
assert(eob <= default_eob);
tokens[eob][0].qc = 0;
tokens[eob][1] = tokens[eob][0];
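+  // Pick the token/cost tables matching the coefficient range of the
+  // current bit depth.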
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->bd == 12) {
+ dct_value_tokens = vp9_dct_value_tokens_high12_ptr;
+ dct_value_cost = vp9_dct_value_cost_high12_ptr;
+ } else if (xd->bd == 10) {
+ dct_value_tokens = vp9_dct_value_tokens_high10_ptr;
+ dct_value_cost = vp9_dct_value_cost_high10_ptr;
+ } else {
+ dct_value_tokens = vp9_dct_value_tokens_ptr;
+ dct_value_cost = vp9_dct_value_cost_ptr;
+ }
+#else
+ dct_value_tokens = vp9_dct_value_tokens_ptr;
+ dct_value_cost = vp9_dct_value_cost_ptr;
+#endif
for (i = 0; i < eob; i++)
token_cache[scan[i]] =
- vp9_pt_energy_class[vp9_dct_value_tokens_ptr[qcoeff[scan[i]]].token];
+ vp9_pt_energy_class[dct_value_tokens[qcoeff[scan[i]]].token];
for (i = eob; i-- > 0;) {
int base_bits, d2, dx;
/* Evaluate the first possibility for this state. */
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
- t0 = (vp9_dct_value_tokens_ptr + x)->token;
+ t0 = (dct_value_tokens + x)->token;
/* Consider both possible successor states. */
if (next < default_eob) {
band = band_translate[i + 1];
UPDATE_RD_COST();
/* And pick the best. */
best = rd_cost1 < rd_cost0;
- base_bits = vp9_dct_value_cost_ptr[x];
+ base_bits = dct_value_cost[x];
dx = mul * (dqcoeff[rc] - coeff[rc]);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
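+        // Bring the error back to an 8-bit range before it is squared
+        // below.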
+ dx >>= xd->bd - 8;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
d2 = dx * dx;
tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
tokens[i][0].error = d2 + (best ? error1 : error0);
t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
} else {
- t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
+ t0 = t1 = (dct_value_tokens + x)->token;
}
if (next < default_eob) {
band = band_translate[i + 1];
UPDATE_RD_COST();
/* And pick the best. */
best = rd_cost1 < rd_cost0;
- base_bits = vp9_dct_value_cost_ptr[x];
+ base_bits = dct_value_cost[x];
if (shortcut) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
+ } else {
+ dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+ }
+#else
dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#endif // CONFIG_VP9_HIGHBITDEPTH
d2 = dx * dx;
}
tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
else
vp9_high_fdct32x32(src, dst, src_stride);
}
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (tx_size) {
+ case TX_32X32:
+ high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vp9_high_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, p->zbin_extra,
+ eob, scan_order->scan, scan_order->iscan);
+ break;
+ case TX_16X16:
+ vp9_high_fdct16x16(src_diff, coeff, diff_stride);
+ vp9_high_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vp9_high_fdct8x8(src_diff, coeff, diff_stride);
+ vp9_high_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_4X4:
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_high_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ default:
+ assert(0);
+ }
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
switch (tx_size) {
case TX_32X32:
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
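+    // DC-only path: the *_1 forward transforms compute just the DC
+    // coefficient (TX_4X4 runs the full transform) before DC quantization.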
+ switch (tx_size) {
+ case TX_32X32:
+ vp9_high_fdct32x32_1(src_diff, coeff, diff_stride);
+ vp9_high_quantize_dc_32x32(coeff, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_16X16:
+ vp9_high_fdct16x16_1(src_diff, coeff, diff_stride);
+ vp9_high_quantize_dc(coeff, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_8X8:
+ vp9_high_fdct8x8_1(src_diff, coeff, diff_stride);
+ vp9_high_quantize_dc(coeff, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_4X4:
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_high_quantize_dc(coeff, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ default:
+ assert(0);
+ }
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
switch (tx_size) {
case TX_32X32:
vp9_fdct32x32_1(src_diff, coeff, diff_stride);
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (tx_size) {
+ case TX_32X32:
+ high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vp9_high_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+ p->round, p->quant, p->quant_shift, qcoeff,
+ dqcoeff, pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_16X16:
+ vp9_high_fdct16x16(src_diff, coeff, diff_stride);
+ vp9_high_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vp9_high_fdct8x8(src_diff, coeff, diff_stride);
+ vp9_high_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_4X4:
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_high_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ default:
+ assert(0);
+ }
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
switch (tx_size) {
case TX_32X32:
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
if (x->skip_encode || p->eobs[block] == 0)
return;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (tx_size) {
+ case TX_32X32:
+ vp9_high_idct32x32_add(dqcoeff, dst, pd->dst.stride,
+ p->eobs[block], xd->bd);
+ break;
+ case TX_16X16:
+ vp9_high_idct16x16_add(dqcoeff, dst, pd->dst.stride,
+ p->eobs[block], xd->bd);
+ break;
+ case TX_8X8:
+ vp9_high_idct8x8_add(dqcoeff, dst, pd->dst.stride,
+ p->eobs[block], xd->bd);
+ break;
+ case TX_4X4:
+ // this is like vp9_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ x->high_itxm_add(dqcoeff, dst, pd->dst.stride,
+ p->eobs[block], xd->bd);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
+ }
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
switch (tx_size) {
case TX_32X32:
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
- if (p->eobs[block] > 0)
+ if (p->eobs[block] > 0) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->high_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ }
}
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
src = &p->src.buf[4 * (j * src_stride + i)];
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (tx_size) {
+ case TX_32X32:
+ scan_order = &vp9_default_scan_orders[TX_32X32];
+ mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+ vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? src_stride : dst_stride,
+ dst, dst_stride, i, j, plane);
+ if (!x->skip_recode) {
+ vp9_high_subtract_block(32, 32, src_diff, diff_stride,
+ src, src_stride, dst, dst_stride, xd->bd);
+ high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vp9_high_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+ p->round, p->quant, p->quant_shift, qcoeff,
+ dqcoeff, pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ }
+ if (!x->skip_encode && *eob) {
+ vp9_high_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+ }
+ break;
+ case TX_16X16:
+ tx_type = get_tx_type(pd->plane_type, xd);
+ scan_order = &vp9_scan_orders[TX_16X16][tx_type];
+ mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+ vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? src_stride : dst_stride,
+ dst, dst_stride, i, j, plane);
+ if (!x->skip_recode) {
+ vp9_high_subtract_block(16, 16, src_diff, diff_stride,
+ src, src_stride, dst, dst_stride, xd->bd);
+ vp9_high_fht16x16(src_diff, coeff, diff_stride, tx_type);
+ vp9_high_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ }
+ if (!x->skip_encode && *eob) {
+ vp9_high_iht16x16_add(tx_type, dqcoeff, dst, dst_stride,
+ *eob, xd->bd);
+ }
+ break;
+ case TX_8X8:
+ tx_type = get_tx_type(pd->plane_type, xd);
+ scan_order = &vp9_scan_orders[TX_8X8][tx_type];
+ mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+ vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? src_stride : dst_stride,
+ dst, dst_stride, i, j, plane);
+ if (!x->skip_recode) {
+ vp9_high_subtract_block(8, 8, src_diff, diff_stride,
+ src, src_stride, dst, dst_stride, xd->bd);
+ vp9_high_fht8x8(src_diff, coeff, diff_stride, tx_type);
+ vp9_high_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ }
+ if (!x->skip_encode && *eob) {
+ vp9_high_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+ xd->bd);
+ }
+ break;
+ case TX_4X4:
+ tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
+ scan_order = &vp9_scan_orders[TX_4X4][tx_type];
+        mode = plane == 0 ? get_y_mode(xd->mi[0].src_mi, block)
+                          : mbmi->uv_mode;
+ vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? src_stride : dst_stride,
+ dst, dst_stride, i, j, plane);
+
+ if (!x->skip_recode) {
+ vp9_high_subtract_block(4, 4, src_diff, diff_stride,
+ src, src_stride, dst, dst_stride, xd->bd);
+ if (tx_type != DCT_DCT)
+ vp9_high_fht4x4(src_diff, coeff, diff_stride, tx_type);
+ else
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_high_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
+ }
+
+ if (!x->skip_encode && *eob) {
+ if (tx_type == DCT_DCT)
+ // this is like vp9_short_idct4x4 but has a special case around
+ // eob<=1 which is significant (not just an optimization) for the
+ // lossless case.
+ x->high_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+ else
+ vp9_high_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);
+ }
+ break;
+ default:
+ assert(0);
+ return;
+ }
+ if (*eob)
+ *(args->skip) = 0;
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
switch (tx_size) {
case TX_32X32:
scan_order = &vp9_default_scan_orders[TX_32X32];
: maximum * bandwidth / 1000;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx3f = SDX3F; \
+ cpi->fn_ptr[BT].sdx8f = SDX8F; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF;
+
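+// The wrappers below rescale high bit depth SADs back to an 8-bit scale:
+// right-shift by 2 for 10-bit input and by 4 for 12-bit, keeping magnitudes
+// comparable across bit depths. For example, MAKE_BFP_SAD_WRAPPER(fnname)
+// defines fnname_bits8, fnname_bits10 and fnname_bits12 with the plain,
+// >> 2 and >> 4 variants respectively.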
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+}
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) static unsigned int \
+fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred) >> 4; \
+}
+
+#define MAKE_BFP_SAD3_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 3; i++) \
+ sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 3; i++) \
+ sad_array[i] >>= 4; \
+}
+
+#define MAKE_BFP_SAD8_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 8; i++) \
+ sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 8; i++) \
+ sad_array[i] >>= 4; \
+}
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t* const ref_ptr[], \
+ int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t* const ref_ptr[], \
+ int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) \
+ sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t* const ref_ptr[], \
+ int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) \
+ sad_array[i] >>= 4; \
+}
+
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad4x4x4d)
+
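+// Binds the bit depth specific SAD/variance implementations; all of the
+// _bits8/_bits10/_bits12 wrappers above report on a common 8-bit scale.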
+static void highbd_set_var_fns(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case VPX_BITS_8:
+ HIGHBD_BFP(BLOCK_32X16,
+ vp9_high_sad32x16_bits8,
+ vp9_high_sad32x16_avg_bits8,
+ vp9_high_variance32x16,
+ vp9_high_sub_pixel_variance32x16,
+ vp9_high_sub_pixel_avg_variance32x16,
+ NULL,
+ NULL,
+ vp9_high_sad32x16x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_16X32,
+ vp9_high_sad16x32_bits8,
+ vp9_high_sad16x32_avg_bits8,
+ vp9_high_variance16x32,
+ vp9_high_sub_pixel_variance16x32,
+ vp9_high_sub_pixel_avg_variance16x32,
+ NULL,
+ NULL,
+ vp9_high_sad16x32x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_64X32,
+ vp9_high_sad64x32_bits8,
+ vp9_high_sad64x32_avg_bits8,
+ vp9_high_variance64x32,
+ vp9_high_sub_pixel_variance64x32,
+ vp9_high_sub_pixel_avg_variance64x32,
+ NULL,
+ NULL,
+ vp9_high_sad64x32x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_32X64,
+ vp9_high_sad32x64_bits8,
+ vp9_high_sad32x64_avg_bits8,
+ vp9_high_variance32x64,
+ vp9_high_sub_pixel_variance32x64,
+ vp9_high_sub_pixel_avg_variance32x64,
+ NULL,
+ NULL,
+ vp9_high_sad32x64x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_32X32,
+ vp9_high_sad32x32_bits8,
+ vp9_high_sad32x32_avg_bits8,
+ vp9_high_variance32x32,
+ vp9_high_sub_pixel_variance32x32,
+ vp9_high_sub_pixel_avg_variance32x32,
+ vp9_high_sad32x32x3_bits8,
+ vp9_high_sad32x32x8_bits8,
+ vp9_high_sad32x32x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_64X64,
+ vp9_high_sad64x64_bits8,
+ vp9_high_sad64x64_avg_bits8,
+ vp9_high_variance64x64,
+ vp9_high_sub_pixel_variance64x64,
+ vp9_high_sub_pixel_avg_variance64x64,
+ vp9_high_sad64x64x3_bits8,
+ vp9_high_sad64x64x8_bits8,
+ vp9_high_sad64x64x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_16X16,
+ vp9_high_sad16x16_bits8,
+ vp9_high_sad16x16_avg_bits8,
+ vp9_high_variance16x16,
+ vp9_high_sub_pixel_variance16x16,
+ vp9_high_sub_pixel_avg_variance16x16,
+ vp9_high_sad16x16x3_bits8,
+ vp9_high_sad16x16x8_bits8,
+ vp9_high_sad16x16x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_16X8,
+ vp9_high_sad16x8_bits8,
+ vp9_high_sad16x8_avg_bits8,
+ vp9_high_variance16x8,
+ vp9_high_sub_pixel_variance16x8,
+ vp9_high_sub_pixel_avg_variance16x8,
+ vp9_high_sad16x8x3_bits8,
+ vp9_high_sad16x8x8_bits8,
+ vp9_high_sad16x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X16,
+ vp9_high_sad8x16_bits8,
+ vp9_high_sad8x16_avg_bits8,
+ vp9_high_variance8x16,
+ vp9_high_sub_pixel_variance8x16,
+ vp9_high_sub_pixel_avg_variance8x16,
+ vp9_high_sad8x16x3_bits8,
+ vp9_high_sad8x16x8_bits8,
+ vp9_high_sad8x16x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X8,
+ vp9_high_sad8x8_bits8,
+ vp9_high_sad8x8_avg_bits8,
+ vp9_high_variance8x8,
+ vp9_high_sub_pixel_variance8x8,
+ vp9_high_sub_pixel_avg_variance8x8,
+ vp9_high_sad8x8x3_bits8,
+ vp9_high_sad8x8x8_bits8,
+ vp9_high_sad8x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X4,
+ vp9_high_sad8x4_bits8,
+ vp9_high_sad8x4_avg_bits8,
+ vp9_high_variance8x4,
+ vp9_high_sub_pixel_variance8x4,
+ vp9_high_sub_pixel_avg_variance8x4,
+ NULL,
+ vp9_high_sad8x4x8_bits8,
+ vp9_high_sad8x4x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_4X8,
+ vp9_high_sad4x8_bits8,
+ vp9_high_sad4x8_avg_bits8,
+ vp9_high_variance4x8,
+ vp9_high_sub_pixel_variance4x8,
+ vp9_high_sub_pixel_avg_variance4x8,
+ NULL,
+ vp9_high_sad4x8x8_bits8,
+ vp9_high_sad4x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_4X4,
+ vp9_high_sad4x4_bits8,
+ vp9_high_sad4x4_avg_bits8,
+ vp9_high_variance4x4,
+ vp9_high_sub_pixel_variance4x4,
+ vp9_high_sub_pixel_avg_variance4x4,
+ vp9_high_sad4x4x3_bits8,
+ vp9_high_sad4x4x8_bits8,
+ vp9_high_sad4x4x4d_bits8)
+ break;
+
+ case VPX_BITS_10:
+ HIGHBD_BFP(BLOCK_32X16,
+ vp9_high_sad32x16_bits10,
+ vp9_high_sad32x16_avg_bits10,
+ vp9_high_10_variance32x16,
+ vp9_high_10_sub_pixel_variance32x16,
+ vp9_high_10_sub_pixel_avg_variance32x16,
+ NULL,
+ NULL,
+ vp9_high_sad32x16x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_16X32,
+ vp9_high_sad16x32_bits10,
+ vp9_high_sad16x32_avg_bits10,
+ vp9_high_10_variance16x32,
+ vp9_high_10_sub_pixel_variance16x32,
+ vp9_high_10_sub_pixel_avg_variance16x32,
+ NULL,
+ NULL,
+ vp9_high_sad16x32x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_64X32,
+ vp9_high_sad64x32_bits10,
+ vp9_high_sad64x32_avg_bits10,
+ vp9_high_10_variance64x32,
+ vp9_high_10_sub_pixel_variance64x32,
+ vp9_high_10_sub_pixel_avg_variance64x32,
+ NULL,
+ NULL,
+ vp9_high_sad64x32x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_32X64,
+ vp9_high_sad32x64_bits10,
+ vp9_high_sad32x64_avg_bits10,
+ vp9_high_10_variance32x64,
+ vp9_high_10_sub_pixel_variance32x64,
+ vp9_high_10_sub_pixel_avg_variance32x64,
+ NULL,
+ NULL,
+ vp9_high_sad32x64x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_32X32,
+ vp9_high_sad32x32_bits10,
+ vp9_high_sad32x32_avg_bits10,
+ vp9_high_10_variance32x32,
+ vp9_high_10_sub_pixel_variance32x32,
+ vp9_high_10_sub_pixel_avg_variance32x32,
+ vp9_high_sad32x32x3_bits10,
+ vp9_high_sad32x32x8_bits10,
+ vp9_high_sad32x32x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_64X64,
+ vp9_high_sad64x64_bits10,
+ vp9_high_sad64x64_avg_bits10,
+ vp9_high_10_variance64x64,
+ vp9_high_10_sub_pixel_variance64x64,
+ vp9_high_10_sub_pixel_avg_variance64x64,
+ vp9_high_sad64x64x3_bits10,
+ vp9_high_sad64x64x8_bits10,
+ vp9_high_sad64x64x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_16X16,
+ vp9_high_sad16x16_bits10,
+ vp9_high_sad16x16_avg_bits10,
+ vp9_high_10_variance16x16,
+ vp9_high_10_sub_pixel_variance16x16,
+ vp9_high_10_sub_pixel_avg_variance16x16,
+ vp9_high_sad16x16x3_bits10,
+ vp9_high_sad16x16x8_bits10,
+ vp9_high_sad16x16x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_16X8,
+ vp9_high_sad16x8_bits10,
+ vp9_high_sad16x8_avg_bits10,
+ vp9_high_10_variance16x8,
+ vp9_high_10_sub_pixel_variance16x8,
+ vp9_high_10_sub_pixel_avg_variance16x8,
+ vp9_high_sad16x8x3_bits10,
+ vp9_high_sad16x8x8_bits10,
+ vp9_high_sad16x8x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_8X16,
+ vp9_high_sad8x16_bits10,
+ vp9_high_sad8x16_avg_bits10,
+ vp9_high_10_variance8x16,
+ vp9_high_10_sub_pixel_variance8x16,
+ vp9_high_10_sub_pixel_avg_variance8x16,
+ vp9_high_sad8x16x3_bits10,
+ vp9_high_sad8x16x8_bits10,
+ vp9_high_sad8x16x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_8X8,
+ vp9_high_sad8x8_bits10,
+ vp9_high_sad8x8_avg_bits10,
+ vp9_high_10_variance8x8,
+ vp9_high_10_sub_pixel_variance8x8,
+ vp9_high_10_sub_pixel_avg_variance8x8,
+ vp9_high_sad8x8x3_bits10,
+ vp9_high_sad8x8x8_bits10,
+ vp9_high_sad8x8x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_8X4,
+ vp9_high_sad8x4_bits10,
+ vp9_high_sad8x4_avg_bits10,
+ vp9_high_10_variance8x4,
+ vp9_high_10_sub_pixel_variance8x4,
+ vp9_high_10_sub_pixel_avg_variance8x4,
+ NULL,
+ vp9_high_sad8x4x8_bits10,
+ vp9_high_sad8x4x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_4X8,
+ vp9_high_sad4x8_bits10,
+ vp9_high_sad4x8_avg_bits10,
+ vp9_high_10_variance4x8,
+ vp9_high_10_sub_pixel_variance4x8,
+ vp9_high_10_sub_pixel_avg_variance4x8,
+ NULL,
+ vp9_high_sad4x8x8_bits10,
+ vp9_high_sad4x8x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_4X4,
+ vp9_high_sad4x4_bits10,
+ vp9_high_sad4x4_avg_bits10,
+ vp9_high_10_variance4x4,
+ vp9_high_10_sub_pixel_variance4x4,
+ vp9_high_10_sub_pixel_avg_variance4x4,
+ vp9_high_sad4x4x3_bits10,
+ vp9_high_sad4x4x8_bits10,
+ vp9_high_sad4x4x4d_bits10)
+ break;
+
+ case VPX_BITS_12:
+ HIGHBD_BFP(BLOCK_32X16,
+ vp9_high_sad32x16_bits12,
+ vp9_high_sad32x16_avg_bits12,
+ vp9_high_12_variance32x16,
+ vp9_high_12_sub_pixel_variance32x16,
+ vp9_high_12_sub_pixel_avg_variance32x16,
+ NULL,
+ NULL,
+ vp9_high_sad32x16x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_16X32,
+ vp9_high_sad16x32_bits12,
+ vp9_high_sad16x32_avg_bits12,
+ vp9_high_12_variance16x32,
+ vp9_high_12_sub_pixel_variance16x32,
+ vp9_high_12_sub_pixel_avg_variance16x32,
+ NULL,
+ NULL,
+ vp9_high_sad16x32x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_64X32,
+ vp9_high_sad64x32_bits12,
+ vp9_high_sad64x32_avg_bits12,
+ vp9_high_12_variance64x32,
+ vp9_high_12_sub_pixel_variance64x32,
+ vp9_high_12_sub_pixel_avg_variance64x32,
+ NULL,
+ NULL,
+ vp9_high_sad64x32x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_32X64,
+ vp9_high_sad32x64_bits12,
+ vp9_high_sad32x64_avg_bits12,
+ vp9_high_12_variance32x64,
+ vp9_high_12_sub_pixel_variance32x64,
+ vp9_high_12_sub_pixel_avg_variance32x64,
+ NULL,
+ NULL,
+ vp9_high_sad32x64x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_32X32,
+ vp9_high_sad32x32_bits12,
+ vp9_high_sad32x32_avg_bits12,
+ vp9_high_12_variance32x32,
+ vp9_high_12_sub_pixel_variance32x32,
+ vp9_high_12_sub_pixel_avg_variance32x32,
+ vp9_high_sad32x32x3_bits12,
+ vp9_high_sad32x32x8_bits12,
+ vp9_high_sad32x32x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_64X64,
+ vp9_high_sad64x64_bits12,
+ vp9_high_sad64x64_avg_bits12,
+ vp9_high_12_variance64x64,
+ vp9_high_12_sub_pixel_variance64x64,
+ vp9_high_12_sub_pixel_avg_variance64x64,
+ vp9_high_sad64x64x3_bits12,
+ vp9_high_sad64x64x8_bits12,
+ vp9_high_sad64x64x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_16X16,
+ vp9_high_sad16x16_bits12,
+ vp9_high_sad16x16_avg_bits12,
+ vp9_high_12_variance16x16,
+ vp9_high_12_sub_pixel_variance16x16,
+ vp9_high_12_sub_pixel_avg_variance16x16,
+ vp9_high_sad16x16x3_bits12,
+ vp9_high_sad16x16x8_bits12,
+ vp9_high_sad16x16x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_16X8,
+ vp9_high_sad16x8_bits12,
+ vp9_high_sad16x8_avg_bits12,
+ vp9_high_12_variance16x8,
+ vp9_high_12_sub_pixel_variance16x8,
+ vp9_high_12_sub_pixel_avg_variance16x8,
+ vp9_high_sad16x8x3_bits12,
+ vp9_high_sad16x8x8_bits12,
+ vp9_high_sad16x8x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_8X16,
+ vp9_high_sad8x16_bits12,
+ vp9_high_sad8x16_avg_bits12,
+ vp9_high_12_variance8x16,
+ vp9_high_12_sub_pixel_variance8x16,
+ vp9_high_12_sub_pixel_avg_variance8x16,
+ vp9_high_sad8x16x3_bits12,
+ vp9_high_sad8x16x8_bits12,
+ vp9_high_sad8x16x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_8X8,
+ vp9_high_sad8x8_bits12,
+ vp9_high_sad8x8_avg_bits12,
+ vp9_high_12_variance8x8,
+ vp9_high_12_sub_pixel_variance8x8,
+ vp9_high_12_sub_pixel_avg_variance8x8,
+ vp9_high_sad8x8x3_bits12,
+ vp9_high_sad8x8x8_bits12,
+ vp9_high_sad8x8x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_8X4,
+ vp9_high_sad8x4_bits12,
+ vp9_high_sad8x4_avg_bits12,
+ vp9_high_12_variance8x4,
+ vp9_high_12_sub_pixel_variance8x4,
+ vp9_high_12_sub_pixel_avg_variance8x4,
+ NULL,
+ vp9_high_sad8x4x8_bits12,
+ vp9_high_sad8x4x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_4X8,
+ vp9_high_sad4x8_bits12,
+ vp9_high_sad4x8_avg_bits12,
+ vp9_high_12_variance4x8,
+ vp9_high_12_sub_pixel_variance4x8,
+ vp9_high_12_sub_pixel_avg_variance4x8,
+ NULL,
+ vp9_high_sad4x8x8_bits12,
+ vp9_high_sad4x8x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_4X4,
+ vp9_high_sad4x4_bits12,
+ vp9_high_sad4x4_avg_bits12,
+ vp9_high_12_variance4x4,
+ vp9_high_12_sub_pixel_variance4x4,
+ vp9_high_12_sub_pixel_avg_variance4x4,
+ vp9_high_sad4x4x3_bits12,
+ vp9_high_sad4x4x8_bits12,
+ vp9_high_sad4x4x4d_bits12)
+ break;
+
+ default:
+ assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+ "VPX_BITS_10 or VPX_BITS_12");
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
if (cpi->oxcf.use_highbitdepth) {
cpi->mb.e_mbd.bd = (int)cm->bit_depth;
}
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
cpi->ext_refresh_frame_flags_pending = 0;
cpi->ext_refresh_frame_context_pending = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0) {
vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
vp9_sub_pixel_avg_variance4x4,
vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+#if CONFIG_VP9_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
/* vp9_init_quantizer() is first called here. Add check in
* vp9_frame_init_quantizer() so that vp9_init_quantizer is only
* called later when needed. This will avoid unnecessary calls of
#endif
}
+
static int64_t get_sse(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int width, int height) {
return total_sse;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int width, int height,
+ unsigned int input_shift) {
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ int64_t total_sse = 0;
+ int x, y;
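+ // Both planes are shifted down to the original input bit depth before
+ // differencing, so the SSE (and the PSNR derived from it) is measured
+ // at the precision of the source rather than the coded bit depth.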
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ int64_t diff;
+ diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+ total_sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+ return total_sse;
+}
+
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int width, int height) {
+ int64_t total_sse = 0;
+ int x, y;
+ const int dw = width % 16;
+ const int dh = height % 16;
+ unsigned int sse = 0;
+ int sum = 0;
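+ // Cover the right edge (width % 16) and bottom edge (height % 16) with
+ // variance calls, then sweep the 16x16-aligned interior with the fast
+ // vp9_high_mse16x16 kernel.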
+ if (dw > 0) {
+ high_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+ dw, height, &sse, &sum);
+ total_sse += sse;
+ }
+ if (dh > 0) {
+ high_variance(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh, &sse, &sum);
+ total_sse += sse;
+ }
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ for (x = 0; x < width / 16; ++x) {
+ vp9_high_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ total_sse += sse;
+ pa += 16;
+ pb += 16;
+ }
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+ return total_sse;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
typedef struct {
double psnr[4]; // total/y/u/v
uint64_t sse[4]; // total/y/u/v
(double)total_sse);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr,
+ unsigned int bit_depth,
+ unsigned int in_bit_depth) {
+ const int widths[3] = {a->y_width, a->uv_width, a->uv_width };
+ const int heights[3] = {a->y_height, a->uv_height, a->uv_height};
+ const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer };
+ const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
+ const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer };
+ const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const unsigned int input_shift = bit_depth - in_bit_depth;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ uint64_t sse;
+ if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (input_shift) {
+ sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
+ b_planes[i], b_strides[i], w, h,
+ input_shift);
+ } else {
+ sse = highbd_get_sse(a_planes[i], a_strides[i],
+ b_planes[i], b_strides[i], w, h);
+ }
+ } else {
+ sse = get_sse(a_planes[i], a_strides[i],
+ b_planes[i], b_strides[i],
+ w, h);
+ }
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+ (double)total_sse);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static void generate_psnr_packet(VP9_COMP *cpi) {
struct vpx_codec_cx_pkt pkt;
int i;
PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+ calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
+ cpi->mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+#else
calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
+#endif
+
for (i = 0; i < 4; ++i) {
pkt.data.psnr.samples[i] = psnr.samples[i];
pkt.data.psnr.sse[i] = psnr.sse[i];
uint8_t *src = s->y_buffer;
int h = cm->height;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+ do {
+ fwrite(src16, s->y_width, 2, yuv_rec_file);
+ src16 += s->y_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
do {
fwrite(src, s->y_width, 1, yuv_rec_file);
src += s->y_stride;
}
#endif
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int bd) {
+#else
static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst) {
+#endif // CONFIG_VP9_HIGHBITDEPTH
// TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
int i;
const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
const int dst_heights[3] = {dst->y_crop_height, dst->uv_crop_height,
dst->uv_crop_height};
- for (i = 0; i < MAX_MB_PLANE; ++i)
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i],
+ src_strides[i], dsts[i], dst_heights[i],
+ dst_widths[i], dst_strides[i], bd);
+ } else {
+ vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+ dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+ }
+#else
vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
-
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
vp9_extend_frame_borders(dst);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int bd) {
+#else
static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst) {
+#endif // CONFIG_VP9_HIGHBITDEPTH
const int src_w = src->y_crop_width;
const int src_h = src->y_crop_height;
const int dst_w = dst->y_crop_width;
src_stride + (x / factor) * src_w / dst_w;
uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_high_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+ kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+ 16 / factor, 16 / factor, bd);
+ } else {
+ vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+ kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+ 16 / factor, 16 / factor);
+ }
+#else
vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
}
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+#if CONFIG_VP9_HIGHBITDEPTH
+ scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf,
+ (int)cm->bit_depth);
+#else
scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
+#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
} else {
cpi->scaled_ref_idx[ref_frame - 1] = idx;
cpi->rc.total_target_vs_actual,
(cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
cpi->rc.total_actual_bits, cm->base_qindex,
- vp9_convert_qindex_to_q(cm->base_qindex),
- (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
- vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality),
+ vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
+ (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+ vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality,
+ cm->bit_depth),
cpi->rc.avg_q,
- vp9_convert_qindex_to_q(cpi->oxcf.cq_level),
+ vp9_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
cpi->refresh_last_frame, cpi->refresh_golden_frame,
cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
cpi->twopass.bits_left,
rc->this_key_frame_forced &&
(rc->projected_frame_size < rc->max_frame_bandwidth)) {
int last_q = q;
- int kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ int kf_err;
int high_err_target = cpi->ambient_err;
int low_err_target = cpi->ambient_err >> 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm),
+ cm->bit_depth);
+ } else {
+ kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ }
+#else
+ kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
// Prevent possible divide by zero error below for perfect KF
kf_err += !kf_err;
YV12_BUFFER_CONFIG *scaled) {
if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+#else
scale_and_extend_frame_nonnormative(unscaled, scaled);
+#endif // CONFIG_VP9_HIGHBITDEPTH
return scaled;
} else {
return unscaled;
// fixed interval. Note the reconstruction error if it is the frame before
// the force key frame
if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
+ get_frame_new_buffer(cm),
+ cm->bit_depth);
+ } else {
+ cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ }
+#else
cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
// If the encoder forced a KEY_FRAME decision
}
}
-static void check_initial_width(VP9_COMP *cpi, int subsampling_x,
- int subsampling_y) {
+static void check_initial_width(VP9_COMP *cpi,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int subsampling_x, int subsampling_y) {
VP9_COMMON *const cm = &cpi->common;
if (!cpi->initial_width) {
cm->subsampling_x = subsampling_x;
cm->subsampling_y = subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth = use_highbitdepth;
+#endif
alloc_raw_frame_buffers(cpi);
alloc_ref_frame_buffers(cpi);
int res = 0;
const int subsampling_x = sd->uv_width < sd->y_width;
const int subsampling_y = sd->uv_height < sd->y_height;
-
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int use_highbitdepth = sd->flags & YV12_FLAG_HIGHBITDEPTH;
+ check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#else
check_initial_width(cpi, subsampling_x, subsampling_y);
+#endif // CONFIG_VP9_HIGHBITDEPTH
vpx_usec_timer_start(&timer);
vp9_setup_scale_factors_for_frame(&ref_buf->sf,
buf->y_crop_width, buf->y_crop_height,
cm->width, cm->height);
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
if (vp9_is_scaled(&ref_buf->sf))
vp9_extend_frame_borders(buf);
}
vp9_high_idct4x4_add;
#else
cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
vp9_first_pass(cpi, source);
} else if (oxcf->pass == 2 &&
YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+ calc_highbd_psnr(orig, recon, &psnr, cpi->mb.e_mbd.bd,
+ cpi->oxcf.input_bit_depth);
+#else
calc_psnr(orig, recon, &psnr);
+#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->total += psnr.psnr[0];
cpi->total_y += psnr.psnr[1];
#endif
vp9_clear_system_state();
+#if CONFIG_VP9_HIGHBITDEPTH
+ calc_highbd_psnr(orig, pp, &psnr2, cpi->mb.e_mbd.bd,
+ cpi->oxcf.input_bit_depth);
+#else
calc_psnr(orig, pp, &psnr2);
+#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->totalp += psnr2.psnr[0];
cpi->totalp_y += psnr2.psnr[1];
cpi->totalp_sq_error += psnr2.sse[0];
cpi->totalp_samples += psnr2.samples[0];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ frame_ssim2 = vp9_highbd_calc_ssim(
+ orig, recon, &weight, xd->bd,
+ xd->bd - cpi->oxcf.input_bit_depth);
+ } else {
+ frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
+ }
+#else
frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
+#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->summed_quality += frame_ssim2 * weight;
cpi->summed_weights += weight;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ frame_ssim2 = vp9_highbd_calc_ssim(
+ orig, &cm->post_proc_buffer, &weight,
+ xd->bd, xd->bd - cpi->oxcf.input_bit_depth);
+ } else {
+ frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+ }
+#else
frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->summedp_quality += frame_ssim2 * weight;
cpi->summedp_weights += weight;
if (cpi->b_calculate_ssimg) {
double y, u, v, frame_all;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ frame_all = vp9_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
+ &u, &v, xd->bd,
+ xd->bd - cpi->oxcf.input_bit_depth);
+ } else {
+ frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
+ &v);
+ }
+#else
frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->total_ssimg_y += y;
cpi->total_ssimg_u += u;
cpi->total_ssimg_v += v;
int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
unsigned int height) {
VP9_COMMON *cm = &cpi->common;
-
+#if CONFIG_VP9_HIGHBITDEPTH
+ check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+#else
check_initial_width(cpi, 1, 1);
+#endif // CONFIG_VP9_HIGHBITDEPTH
if (width) {
cm->width = width;
a->y_crop_width, a->y_crop_height);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ vpx_bit_depth_t bit_depth) {
+ unsigned int sse;
+ int sum;
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ switch (bit_depth) {
+ case VPX_BITS_8:
+ high_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height, &sse, &sum);
+ return (int) sse;
+ case VPX_BITS_10:
+ high_10_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height, &sse, &sum);
+ return (int) sse;
+ case VPX_BITS_12:
+ high_12_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height, &sse, &sum);
+ return (int) sse;
+ default:
+ assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+ return -1;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
int vp9_get_quantizer(VP9_COMP *cpi) {
return cpi->common.base_qindex;
}
int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ vpx_bit_depth_t bit_depth);
+#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_alloc_compressor_data(VP9_COMP *cpi);
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+ uint8_t *dst8, int dst_pitch,
+ int w, int h,
+ int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, linesize;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ // copy the left and right most columns out
+ const uint16_t *src_ptr1 = src;
+ const uint16_t *src_ptr2 = src + w - 1;
+ uint16_t *dst_ptr1 = dst - extend_left;
+ uint16_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
+ vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(uint16_t));
+ vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+ dst_ptr2 = dst + dst_pitch * (h) - extend_left;
+ linesize = extend_left + extend_right + w;
+
+ for (i = 0; i < extend_top; i++) {
+ vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+ dst_ptr2 += dst_pitch;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst) {
// Extend src frame in buffer
const int eb_uv = eb_y >> uv_height_subsampling;
const int er_uv = er_y >> uv_width_subsampling;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_copy_and_extend_plane(src->y_buffer, src->y_stride,
+ dst->y_buffer, dst->y_stride,
+ src->y_width, src->y_height,
+ et_y, el_y, eb_y, er_y);
+
+ highbd_copy_and_extend_plane(src->u_buffer, src->uv_stride,
+ dst->u_buffer, dst->uv_stride,
+ src->uv_width, src->uv_height,
+ et_uv, el_uv, eb_uv, er_uv);
+
+ highbd_copy_and_extend_plane(src->v_buffer, src->uv_stride,
+ dst->v_buffer, dst->uv_stride,
+ src->uv_width, src->uv_height,
+ et_uv, el_uv, eb_uv, er_uv);
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
copy_and_extend_plane(src->y_buffer, src->y_stride,
dst->y_buffer, dst->y_stride,
src->y_width, src->y_height,
return sse;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+ int bd) {
+ switch (bd) {
+ default:
+ switch (bsize) {
+ case BLOCK_8X8:
+ return vp9_high_mse8x8;
+ case BLOCK_16X8:
+ return vp9_high_mse16x8;
+ case BLOCK_8X16:
+ return vp9_high_mse8x16;
+ default:
+ return vp9_high_mse16x16;
+ }
+ break;
+ case 10:
+ switch (bsize) {
+ case BLOCK_8X8:
+ return vp9_high_10_mse8x8;
+ case BLOCK_16X8:
+ return vp9_high_10_mse16x8;
+ case BLOCK_8X16:
+ return vp9_high_10_mse8x16;
+ default:
+ return vp9_high_10_mse16x16;
+ }
+ break;
+ case 12:
+ switch (bsize) {
+ case BLOCK_8X8:
+ return vp9_high_12_mse8x8;
+ case BLOCK_16X8:
+ return vp9_high_12_mse16x8;
+ case BLOCK_8X16:
+ return vp9_high_12_mse8x16;
+ default:
+ return vp9_high_12_mse16x16;
+ }
+ break;
+ }
+}
+
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref,
+ int bd) {
+ unsigned int sse;
+ const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
// Refine the motion search range according to the frame dimension
// for first pass test.
static int get_search_range(const VP9_COMMON *cm) {
// Override the default variance function to use MSE.
v_fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
// Center the initial step/diamond search on best mv.
tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
vp9_encode_intra_block_plane(x, bsize, 0);
this_error = vp9_get_mb_ss(x->plane[0].src_diff);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
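+ // this_error is a sum of squared differences; pixel magnitudes scale by
+ // 2^(bd - 8), so the SSE scales by 2^(2 * (bd - 8)): shift by 4 at 10
+ // bits and by 8 at 12 bits to return to the 8-bit scale.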
+ switch (cm->bit_depth) {
+ case VPX_BITS_8:
+ break;
+ case VPX_BITS_10:
+ this_error >>= 4;
+ break;
+ case VPX_BITS_12:
+ this_error >>= 8;
+ break;
+ default:
+ assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+ "VPX_BITS_10 or VPX_BITS_12");
+ return;
+ }
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
struct buf_2d unscaled_last_source_buf_2d;
xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
- motion_error = get_prediction_error(bsize, &x->plane[0].src,
- &xd->plane[0].pre[0]);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
+ motion_error = get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+ }
+#else
+ motion_error = get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
// Compute the motion error of the 0,0 motion using the last source
// frame as the reference. Skip the further motion search on
cpi->unscaled_last_source->y_buffer + recon_yoffset;
unscaled_last_source_buf_2d.stride =
cpi->unscaled_last_source->y_stride;
- raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
- &unscaled_last_source_buf_2d);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ raw_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+ } else {
+ raw_motion_error = get_prediction_error(
+ bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+ }
+#else
+ raw_motion_error = get_prediction_error(
+ bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+#endif // CONFIG_VP9_HIGHBITDEPTH
// TODO(pengchong): Replace the hard-coded threshold
if (raw_motion_error > 25 || lc != NULL) {
int gf_motion_error;
xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
- gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
- &xd->plane[0].pre[0]);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ gf_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
+ gf_motion_error = get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+ }
+#else
+ gf_motion_error = get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
&gf_motion_error);
int tc = bc; \
\
bestmv->row *= 8; \
- bestmv->col *= 8; \
- if (second_pred != NULL) { \
- DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \
- vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
- besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \
- } else { \
- besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \
- } \
- *distortion = besterr; \
+ bestmv->col *= 8;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define SETUP_CENTER_ERROR \
+ if (second_pred != NULL) { \
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { \
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64); \
+ vp9_high_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, \
+ y_stride); \
+ besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride, \
+ sse1); \
+ } else { \
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \
+ vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
+ besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \
+ } \
+ } else { \
+ besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \
+ } \
+ *distortion = besterr; \
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+#else
+
+#define SETUP_CENTER_ERROR \
+ if (second_pred != NULL) { \
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \
+ vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
+ besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \
+ } else { \
+ besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \
+ } \
+ *distortion = besterr; \
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+
+
+
+static INLINE int divide_and_round(const int n, const int d) {
+ return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
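+// For reference: divide_and_round(16, 12) == 1 and
+// divide_and_round(-16, 12) == -1 (rounds to nearest, ties away from zero).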
+
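+// The surface fit below requires the center cost to be the strict minimum
+// of the five points, which keeps both denominators in get_cost_surf_min()
+// positive and the estimated minimum inside the sampled neighborhood.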
+static INLINE int is_sad_list_wellbehaved(int *sad_list) {
+ return sad_list[0] < sad_list[1] &&
+ sad_list[0] < sad_list[2] &&
+ sad_list[0] < sad_list[3] &&
+ sad_list[0] < sad_list[4];
+}
+
+// Returns surface minima estimate at given precision in 1/2^n bits.
+// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minima (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static void get_cost_surf_min(int *sad_list, int *ir, int *ic,
+ int bits) {
+ *ic = divide_and_round((sad_list[1] - sad_list[3]) * (1 << (bits - 1)),
+ (sad_list[1] - 2 * sad_list[0] + sad_list[3]));
+ *ir = divide_and_round((sad_list[4] - sad_list[2]) * (1 << (bits - 1)),
+ (sad_list[4] - 2 * sad_list[0] + sad_list[2]));
+}
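+// Illustrative example: with S0..S4 = 100, 108, 112, 104, 120 and bits = 3,
+// ic = divide_and_round((108 - 104) * 4, 108 - 200 + 104) = 16 / 12 -> 1,
+// ir = divide_and_round((120 - 112) * 4, 120 - 200 + 112) = 32 / 32 -> 1,
+// i.e. the estimated minimum lies one 1/8-pel right of and below the center.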
+
+int vp9_find_best_sub_pixel_surface_fit(const MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
int allow_hp,
int error_per_bit,
const uint8_t *second_pred,
int w, int h) {
SETUP_SUBPEL_SEARCH;
+ SETUP_CENTER_ERROR;
+ (void) halfiters;
+ (void) quarteriters;
+ (void) eighthiters;
+ (void) whichdir;
+ (void) allow_hp;
+ (void) forced_stop;
+ (void) hstep;
+ if (sad_list &&
+ sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
+ sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
+ sad_list[4] != INT_MAX &&
+ is_sad_list_wellbehaved(sad_list)) {
+ int ir, ic;
+ unsigned int minpt;
+ get_cost_surf_min(sad_list, &ir, &ic, 3);
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + ir, tc + ic);
+ }
+ }
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
+
+ return besterr;
+}
+
+int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop,
+ int iters_per_step,
+ int *sad_list,
+ int *mvjcost, int *mvcost[2],
+ int *distortion,
+ unsigned int *sse1,
+ const uint8_t *second_pred,
+ int w, int h) {
+ SETUP_SUBPEL_SEARCH;
+ SETUP_CENTER_ERROR;
+ if (sad_list &&
+ sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
+ sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
+ sad_list[4] != INT_MAX &&
+ is_sad_list_wellbehaved(sad_list)) {
+ unsigned int minpt;
+ int ir, ic;
+ get_cost_surf_min(sad_list, &ir, &ic, 1);
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ // Each subsequent iteration checks at least one point in common with
+ // the last iteration (two if the diagonal was selected). The block
+ // below refines to 1/4 pel.
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void) tr;
+ (void) tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
+
+ return besterr;
+}
+
+int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop,
+ int iters_per_step,
+ int *sad_list,
+ int *mvjcost, int *mvcost[2],
+ int *distortion,
+ unsigned int *sse1,
+ const uint8_t *second_pred,
+ int w, int h) {
+ SETUP_SUBPEL_SEARCH;
+ SETUP_CENTER_ERROR;
if (sad_list &&
sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
const uint8_t *second_pred,
int w, int h) {
SETUP_SUBPEL_SEARCH;
+ SETUP_CENTER_ERROR;
(void) sad_list; // to silence compiler warning
// Each subsequent iteration checks at least one point in
#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale
#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
+// Calculate and return a sad+mvcost list around an integer best pel.
+static INLINE void calc_int_sad_cost_list(MACROBLOCK *x,
+ const MV *ref_mv,
+ int sadpb,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *best_mv,
+ int *cost_list) {
+ static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
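+ // Neighbor order is left, below, right, above -- the same (0,-1), (1,0),
+ // (0,1), (-1,0) ordering that get_cost_surf_min() assumes for S1..S4.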
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+ const MV fcenter_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+ int br = best_mv->row;
+ int bc = best_mv->col;
+ MV this_mv;
+ int i;
+
+ this_mv.row = br;
+ this_mv.col = bc;
+ cost_list[0] = fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+ if (check_bounds(x, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = {br + neighbors[i].row,
+ bc + neighbors[i].col};
+ cost_list[i + 1] = fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = {br + neighbors[i].row,
+ bc + neighbors[i].col};
+ if (!is_mv_in(x, &this_mv))
+ cost_list[i + 1] = INT_MAX;
+ else
+ cost_list[i + 1] = fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+ }
+ }
+}
+
// Generic pattern search function that searches over multiple scales.
// Each scale can have a different number of candidates and shape of
// candidates as indicated in the num_candidates and candidates arrays
/* do_refine: If last step (1-away) of n-step search doesn't pick the center
point as the best match, we will do a final 1-away diamond
refining search */
-
int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
+ int *cost_list,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv) {
MV temp_mv;
*dst_mv = best_mv;
}
}
+
+ // Return cost list.
+ if (cost_list) {
+ calc_int_sad_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+ }
return bestsme;
}
case NSTEP:
var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
MAX_MVSEARCH_STEPS - 1 - step_param,
- 1, fn_ptr, ref_mv, tmp_mv);
+ 1, sad_list, fn_ptr, ref_mv, tmp_mv);
break;
default:
assert(!"Invalid search method.");
int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
+ int *cost_list,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv);
extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_surface_fit;
typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
const MV *ref_mv, int sad_per_bit,
vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1,
partial_frame);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show, cm->bit_depth);
+ } else {
+ filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+ }
+#else
filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+#endif // CONFIG_VP9_HIGHBITDEPTH
// Re-instate the unfiltered frame
vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
// These values were determined by linear fitting the result of the
// searched level, filt_guess = q * 0.316206 + 3.87252
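+ // The same linear fit is reused at higher bit depths: q grows by 4x for
+ // every 2 extra bits, so the additive constant and the rounding shift are
+ // scaled by the same factor (x4 and +2 respectively) to keep filt_guess
+ // on the unscaled filter-level range.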
+#if CONFIG_VP9_HIGHBITDEPTH
+ int filt_guess;
+ switch (cm->bit_depth) {
+ case VPX_BITS_8:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+ break;
+ case VPX_BITS_10:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+ break;
+ case VPX_BITS_12:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+ break;
+ default:
+ assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
+ "or VPX_BITS_12");
+ return;
+ }
+#else
int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif // CONFIG_VP9_HIGHBITDEPTH
if (cm->frame_type == KEY_FRAME)
filt_guess -= 4;
lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
}
+#if CONFIG_VP9_HIGHBITDEPTH
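+ // dc_quant is scaled by 2^(bd - 8) at high bit depths; shifting by
+ // (xd->bd - 5) instead of 3 restores the 8-bit scale that the
+ // rate-distortion model was fitted on.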
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+ }
+#else
vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
dc_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
*out_rate_sum = rate >> 1;
*out_dist_sum = dist << 3;
- vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
- ac_quant >> 3, &rate, &dist);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(var,
+ 1 << num_pels_log2_lookup[bsize],
+ ac_quant >> (xd->bd - 5),
+ &rate,
+ &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(var,
+ 1 << num_pels_log2_lookup[bsize],
+ ac_quant >> 3,
+ &rate,
+ &dist);
+ }
+#else
+ vp9_model_rd_from_var_lapndz(var,
+ 1 << num_pels_log2_lookup[bsize],
+ ac_quant >> 3,
+ &rate,
+ &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
*out_rate_sum += rate;
*out_dist_sum += dist << 4;
}
// The encode_breakout input
const unsigned int min_thresh =
MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int shift = 2 * xd->bd - 16;
+#endif
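+ // The thresholds below are products of two dequant values, each scaled
+ // by 2^(bd - 8) at high bit depths, so the shift removes the squared
+ // factor.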
// Calculate threshold according to dequant value.
thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+ thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
// Adjust ac threshold according to partition size.
8 - (b_width_log2(bsize) + b_height_log2(bsize));
thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+ thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
} else {
thresh_ac = 0;
thresh_dc = 0;
// var_y and sse_y are saved to be used in skipping checking
unsigned int var_y = UINT_MAX;
unsigned int sse_y = UINT_MAX;
-
- const int intra_cost_penalty =
- 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
intra_cost_penalty, 0);
const int intra_mode_cost = 50;
// tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
PRED_BUFFER tmp[4];
DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, pred_buf_16, 3 * 64 * 64);
+#endif
struct buf_2d orig_dst = pd->dst;
PRED_BUFFER *best_pred = NULL;
PRED_BUFFER *this_mode_pred = NULL;
+ const int pixels_in_block = bh * bw;
if (cpi->sf.reuse_inter_pred_sby) {
int i;
for (i = 0; i < 3; i++) {
- tmp[i].data = &pred_buf[bw * bh * i];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ tmp[i].data = CONVERT_TO_BYTEPTR(&pred_buf_16[pixels_in_block * i]);
+ else
+ tmp[i].data = &pred_buf[pixels_in_block * i];
+#else
+ tmp[i].data = &pred_buf[pixels_in_block * i];
+#endif // CONFIG_VP9_HIGHBITDEPTH
tmp[i].stride = bw;
tmp[i].in_use = 0;
}
if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby &&
best_pred->data != orig_dst.buf) {
pd->dst = orig_dst;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ vp9_high_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,
+ NULL, 0, NULL, 0, bw, bh, xd->bd);
+ } else {
+ vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,
+ NULL, 0, NULL, 0, bw, bh);
+ }
+#else
vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, NULL, 0,
NULL, 0, bw, bh);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
mbmi->mode = best_mode;
}
#else
int rdmult = 88 * q * q / 24;
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
#else
(void) bit_depth;
q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
// TODO(debargha): Adjust the function below.
return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
}
#else
cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
if (sf->disable_split_mask & (1 << i))
rd->thresh_mult_sub8x8[i] = INT_MAX;
}
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+ vpx_bit_depth_t bit_depth) {
+ const int q = vp9_dc_quant(qindex, qdelta, bit_depth);
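+ // vp9_dc_quant() already returns q scaled by 4x at 10 bits and 16x at
+ // 12 bits, so the 8-bit penalty of 20 * q is scaled down by the same
+ // factors (20 / 4 = 5, 20 / 16 = 5 / 4) to keep it constant in 8-bit
+ // q units.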
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (bit_depth) {
+ case VPX_BITS_8:
+ return 20 * q;
+ case VPX_BITS_10:
+ return 5 * q;
+ case VPX_BITS_12:
+ return ROUND_POWER_OF_TWO(5 * q, 2);
+ default:
+ assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+ return -1;
+ }
+#else
+ return 20 * q;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
int mi_row, int mi_col,
const struct scale_factors *scale,
const struct scale_factors *scale_uv);
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+ vpx_bit_depth_t bit_depth);
+
#ifdef __cplusplus
} // extern "C"
#endif
// Fast approximate the modelling function.
if (cpi->oxcf.speed > 4) {
int64_t rate;
- int64_t dist;
int64_t square_error = sse;
int quantizer = (pd->dequant[1] >> 3);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ quantizer >>= (xd->bd - 8);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
if (quantizer < 120)
rate = (square_error * (280 - quantizer)) >> 8;
rate_sum += rate;
dist_sum += dist;
} else {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ pd->dequant[1] >> (xd->bd - 5),
+ &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ pd->dequant[1] >> 3, &rate, &dist);
+ }
+#else
vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
rate_sum += rate;
dist_sum += dist;
}
return error;
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_high_block_error_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ int64_t *ssz, int bd) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+ int shift = 2 * (bd - 8);
+ int rounding = shift > 0 ? 1 << (shift - 1) : 0;
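+ // The error accumulates at the coded bit depth; shift = 2 * (bd - 8)
+ // renormalizes the squared terms to the 8-bit scale, with 'rounding'
+ // making the right shift round to nearest.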
+
+ for (i = 0; i < block_size; i++) {
+ const int64_t diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
/* The trailing '0' is a terminator which is used inside cost_coeffs() to
* decide whether to include cost of a trailing EOB node or not (i.e. we
* can skip this if the last coefficient in this transform block, e.g. the
return cost;
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void dist_block(int plane, int block, TX_SIZE tx_size,
+ struct rdcost_block_args* args, int bd) {
+#else
static void dist_block(int plane, int block, TX_SIZE tx_size,
struct rdcost_block_args* args) {
+#endif // CONFIG_VP9_HIGHBITDEPTH
const int ss_txfrm_size = tx_size << 1;
MACROBLOCK* const x = args->x;
MACROBLOCKD* const xd = &x->e_mbd;
int shift = tx_size == TX_32X32 ? 0 : 2;
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_VP9_HIGHBITDEPTH
+ args->dist = vp9_high_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+ &this_sse, bd) >> shift;
+#else
args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
&this_sse) >> shift;
+#endif // CONFIG_VP9_HIGHBITDEPTH
args->sse = this_sse >> shift;
if (x->skip_encode && !is_inter_block(&xd->mi[0].src_mi->mbmi)) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
(1 << ss_txfrm_size)) >> (shift + 2);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ p >>= ((xd->bd - 8) * 2);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
args->dist += (p >> 4);
args->sse += p;
}
if (!is_inter_block(mbmi)) {
vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dist_block(plane, block, tx_size, args, xd->bd);
+ } else {
+ dist_block(plane, block, tx_size, args, 8);
+ }
+#else
dist_block(plane, block, tx_size, args);
+#endif // CONFIG_VP9_HIGHBITDEPTH
} else if (max_txsize_lookup[plane_bsize] == tx_size) {
if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
// full forward transform and quantization
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dist_block(plane, block, tx_size, args, xd->bd);
+ } else {
+ dist_block(plane, block, tx_size, args, 8);
+ }
+#else
dist_block(plane, block, tx_size, args);
+#endif // CONFIG_VP9_HIGHBITDEPTH
} else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
// compute DC coefficient
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
} else {
// full forward transform and quantization
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dist_block(plane, block, tx_size, args, xd->bd);
+ } else {
+ dist_block(plane, block, tx_size, args, 8);
+ }
+#else
dist_block(plane, block, tx_size, args);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
rate_block(plane, block, plane_bsize, tx_size, args);
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
uint8_t best_dst[8 * 8];
+#if CONFIG_VP9_HIGHBITDEPTH
+ uint16_t best_dst16[8 * 8];
+#endif
assert(ib < 4);
vpx_memcpy(tl, l, sizeof(tl));
xd->mi[0].src_mi->mbmi.tx_size = TX_4X4;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ int64_t this_rd;
+ int ratey = 0;
+ int64_t distortion = 0;
+ int rate = bmode_costs[mode];
+
+ if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
+ continue;
+
+ // Only do the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mode, *best_mode))
+ continue;
+ }
+
+ vpx_memcpy(tempa, ta, sizeof(ta));
+ vpx_memcpy(templ, tl, sizeof(tl));
+
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+ const int block = ib + idy * 2 + idx;
+ const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+ uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+ int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
+ p->src_diff);
+ tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+ xd->mi[0].src_mi->bmi[block].as_mode = mode;
+ vp9_predict_intra_block(xd, block, 1,
+ TX_4X4, mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? src_stride : dst_stride,
+ dst, dst_stride, idx, idy, 0);
+ vp9_high_subtract_block(4, 4, src_diff, 8, src, src_stride,
+ dst, dst_stride, xd->bd);
+ if (xd->lossless) {
+ const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+ vp9_high_fwht4x4(src_diff, coeff, 8);
+ vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+ ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+ so->scan, so->neighbors,
+ cpi->sf.use_fast_coef_costing);
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next_highbd;
+ vp9_high_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
+ dst, dst_stride,
+ p->eobs[block], xd->bd);
+ } else {
+ int64_t unused;
+ const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+ const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+ vp9_high_fht4x4(src_diff, coeff, 8, tx_type);
+ vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+ ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+ so->scan, so->neighbors,
+ cpi->sf.use_fast_coef_costing);
+ distortion += vp9_high_block_error(coeff,
+ BLOCK_OFFSET(pd->dqcoeff, block),
+ 16, &unused, xd->bd) >> 2;
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next_highbd;
+ vp9_high_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+ dst, dst_stride, p->eobs[block], xd->bd);
+ }
+ }
+ }
+
+ rate += ratey;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd) {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ *best_mode = mode;
+ vpx_memcpy(a, tempa, sizeof(tempa));
+ vpx_memcpy(l, templ, sizeof(templ));
+ for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+ vpx_memcpy(best_dst16 + idy * 8,
+ CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+ }
+ }
+ next_highbd:
+ {}
+ }
+ if (best_rd >= rd_thresh || x->skip_encode)
+ return best_rd;
+
+ for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+ vpx_memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ best_dst16 + idy * 8,
+ num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+ }
+
+ return best_rd;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
int64_t this_rd;
int ratey = 0;
for (ref = 0; ref < 1 + is_compound; ++ref) {
const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
pd->pre[ref].stride)];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_high_build_inter_predictor(pre, pd->pre[ref].stride,
+ dst, pd->dst.stride,
+ &mi->bmi[i].as_mv[ref].as_mv,
+ &xd->block_refs[ref]->sf, width, height, ref,
+ kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i % 2),
+ mi_row * MI_SIZE + 4 * (i / 2), xd->bd);
+ } else {
vp9_build_inter_predictor(pre, pd->pre[ref].stride,
dst, pd->dst.stride,
&mi->bmi[i].as_mv[ref].as_mv,
mi_col * MI_SIZE + 4 * (i % 2),
mi_row * MI_SIZE + 4 * (i / 2));
}
+#else
+ vp9_build_inter_predictor(pre, pd->pre[ref].stride,
+ dst, pd->dst.stride,
+ &mi->bmi[i].as_mv[ref].as_mv,
+ &xd->block_refs[ref]->sf, width, height, ref,
+ kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i % 2),
+ mi_row * MI_SIZE + 4 * (i / 2));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_high_subtract_block(
+ height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+ src, p->src.stride, dst, pd->dst.stride, xd->bd);
+ } else {
+ vp9_subtract_block(
+ height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+ src, p->src.stride, dst, pd->dst.stride);
+ }
+#else
vp9_subtract_block(height, width,
raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
- src, p->src.stride,
- dst, pd->dst.stride);
+ src, p->src.stride, dst, pd->dst.stride);
+#endif // CONFIG_VP9_HIGHBITDEPTH
k = i;
for (idy = 0; idy < height / 4; ++idy) {
x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ thisdistortion += vp9_high_block_error(coeff,
+ BLOCK_OFFSET(pd->dqcoeff, k),
+ 16, &ssz, xd->bd);
+ } else {
+ thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
+ 16, &ssz);
+ }
+#else
thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
16, &ssz);
+#endif // CONFIG_VP9_HIGHBITDEPTH
thissse += ssz;
thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
so->scan, so->neighbors,
int_mv ref_mv[2];
int ite, ref;
// Prediction buffer from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+ uint8_t *second_pred;
+ uint8_t *second_pred_alloc;
+#else
uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+#endif // CONFIG_VP9_HIGHBITDEPTH
const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
// Do joint motion search in compound mode to get more accurate mv.
vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
};
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint16_t));
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc);
+ } else {
+ second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+ second_pred = second_pred_alloc;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
for (ref = 0; ref < 2; ++ref) {
ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
ref_yv12[1] = xd->plane[0].pre[1];
// Get pred block from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_high_build_inter_predictor(ref_yv12[!id].buf,
+ ref_yv12[!id].stride,
+ second_pred, pw,
+ &frame_mv[refs[!id]].as_mv,
+ &xd->block_refs[!id]->sf,
+ pw, ph, 0,
+ kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE,
+ xd->bd);
+ } else {
+ vp9_build_inter_predictor(ref_yv12[!id].buf,
+ ref_yv12[!id].stride,
+ second_pred, pw,
+ &frame_mv[refs[!id]].as_mv,
+ &xd->block_refs[!id]->sf,
+ pw, ph, 0,
+ kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE);
+ }
+#else
vp9_build_inter_predictor(ref_yv12[!id].buf,
ref_yv12[!id].stride,
second_pred, pw,
pw, ph, 0,
kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE, mi_row * MI_SIZE);
+#endif // CONFIG_VP9_HIGHBITDEPTH
// Compound motion search on first ref frame.
if (id)
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_free(second_pred_alloc);
+#else
vpx_free(second_pred);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
static INLINE void restore_dst_buf(MACROBLOCKD *xd,
// Calculate threshold according to dequant value.
thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const int shift = 2 * xd->bd - 16;
+ if (shift > 0)
+ thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
// Adjust threshold according to partition size.
thresh_ac >>= 8 - (b_width_log2(bsize) +
b_height_log2(bsize));
thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const int shift = 2 * xd->bd - 16;
+ if (shift > 0)
+ thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
} else {
thresh_ac = 0;
thresh_dc = 0;
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv cur_mv[2];
- int64_t this_rd = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64);
+ uint8_t *tmp_buf = tmp_buf8;
+#else
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
+#endif // CONFIG_VP9_HIGHBITDEPTH
int pred_exists = 0;
int intpel_mv;
int64_t rd, tmp_rd, best_rd = INT64_MAX;
(((mi_row + mi_col) >> bsl) +
get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
+ } else {
+ tmp_buf = tmp_buf8;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
if (pred_filter_search) {
INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
if (xd->up_available)
single_skippable[this_mode][refs[0]] = *skippable;
restore_dst_buf(xd, orig_dst, orig_dst_stride);
- return this_rd; // if 0, this will be re-calculated by caller
+ return 0; // The rate-distortion cost will be re-calculated by the caller.
}
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
ctx->mic = *xd->mi[0].src_mi;
}
-// Updating rd_thresh_freq_fact[] here means that the different
-// partition/block sizes are handled independently based on the best
-// choice for the current partition. It may well be better to keep a scaled
-// best rd so far value and update rd_thresh_freq_fact based on the mode/size
-// combination that wins out.
static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize,
int best_mode_index) {
if (cpi->sf.adaptive_rd_thresh > 0) {
const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
int mode;
for (mode = 0; mode < top_mode; ++mode) {
- int *const fact = &cpi->rd.thresh_freq_fact[bsize][mode];
-
- if (mode == best_mode_index) {
- *fact -= (*fact >> 3);
- } else {
- *fact = MIN(*fact + RD_THRESH_INC,
- cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+ const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
+ const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
+ BLOCK_SIZE bs;
+ for (bs = min_size; bs <= max_size; ++bs) {
+ int *const fact = &cpi->rd.thresh_freq_fact[bs][mode];
+ if (mode == best_mode_index) {
+ *fact -= (*fact >> 4);
+ } else {
+ *fact = MIN(*fact + RD_THRESH_INC,
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+ }
}
}
}
int64_t dist_uv[TX_SIZES];
int skip_uv[TX_SIZES];
PREDICTION_MODE mode_uv[TX_SIZES];
- const int intra_cost_penalty =
- 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
int best_skip2 = 0;
uint8_t ref_frame_skip_mask[2] = { 0 };
uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
int mode_skip_start = cpi->sf.mode_skip_start + 1;
const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
- int mode_threshold[MAX_MODES];
+ int64_t mode_threshold[MAX_MODES];
int *mode_map = rd_opt->mode_map[bsize];
const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
vp9_zero(best_mbmode);
}
}
+ if (cpi->sf.alt_ref_search_fp)
+ if (!cm->show_frame)
+ if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+ mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
if (bsize > cpi->sf.max_intra_bsize) {
ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
// based on qp, activity mask and history
if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
(mode_index > MIN_EARLY_TERM_INDEX)) {
- const int qstep = xd->plane[0].dequant[1];
+ int qstep = xd->plane[0].dequant[1];
// TODO(debargha): Enhance this by specializing for each mode_index
int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ qstep >>= (xd->bd - 8);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
if (x->source_variance < UINT_MAX) {
const int var_adjust = (x->source_variance < 16);
scale -= var_adjust;
int64_t dist_uv;
int skip_uv;
PREDICTION_MODE mode_uv = DC_PRED;
- const int intra_cost_penalty =
- 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
int_mv seg_mvs[4][MAX_REF_FRAMES];
b_mode_info best_bmodes[4];
int best_skip2 = 0;
// based on qp, activity mask and history
if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
(ref_index > MIN_EARLY_TERM_INDEX)) {
- const int qstep = xd->plane[0].dequant[1];
+ int qstep = xd->plane[0].dequant[1];
// TODO(debargha): Enhance this by specializing for each mode_index
int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ qstep >>= (xd->bd - 8);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
if (x->source_variance < UINT_MAX) {
const int var_adjust = (x->source_variance < 16);
scale -= var_adjust;
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far);
-
#ifdef __cplusplus
} // extern "C"
#endif
static void down2_symeven(const uint8_t *const input, int length,
uint8_t *output) {
// Actual filter len = 2 * filter_len_half.
- static const int16_t *filter = vp9_down2_symeven_half_filter;
+ const int16_t *filter = vp9_down2_symeven_half_filter;
const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
int i, j;
uint8_t *optr = output;
static void down2_symodd(const uint8_t *const input, int length,
uint8_t *output) {
// Actual filter len = 2 * filter_len_half - 1.
- static const int16_t *filter = vp9_down2_symodd_half_filter;
+ const int16_t *filter = vp9_down2_symodd_half_filter;
const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
int i, j;
uint8_t *optr = output;
free(arrbuf);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_interpolate(const uint16_t *const input, int inlength,
+ uint16_t *output, int outlength, int bd) {
+ const int64_t delta =
+ (((uint64_t)inlength << 32) + outlength / 2) / outlength;
+ const int64_t offset = inlength > outlength ?
+ (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength :
+ -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength;
+ uint16_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int64_t y;
+
+ const interp_kernel *interp_filters =
+ choose_interp_filter(inlength, outlength);
+
+ x = 0;
+ y = offset;
+ while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ x = outlength - 1;
+ y = delta * x + offset;
+ while ((y >> INTERP_PRECISION_BITS) +
+ (int64_t)(INTERP_TAPS / 2) >= inlength) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {
+ for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k) {
+ const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+ sum += filter[k] *
+ input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))];
+ }
+ *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ } else {
+ // Initial part.
+ for (x = 0, y = offset; x < x1; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] *
+ input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
+ 0 : int_pel - INTERP_TAPS / 2 + 1 + k)];
+ *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // Middle part.
+ for (; x <= x2; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+ *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // End part.
+ for (; x < outlength; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
+ inlength ? inlength - 1 :
+ int_pel - INTERP_TAPS / 2 + 1 + k)];
+ *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ }
+}
+
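// A sketch of the Q32 fixed-point stepping used by highbd_interpolate()
// above: each output position maps to a source position y = offset + x *
// delta; the integer part selects the centre tap and the top SUBPEL_BITS of
// the fraction select a filter phase. INTERP_PRECISION_BITS and SUBPEL_BITS
// are assumed to match the #defines earlier in vp9_resize.c.
#include <stdint.h>
#include <stdio.h>

#define INTERP_PRECISION_BITS 32  /* assumed to match vp9_resize.c */
#define SUBPEL_BITS 5             /* assumed to match vp9_resize.c */
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

int main(void) {
  const int inlength = 640, outlength = 1280;  /* hypothetical 2x upscale */
  const int64_t delta =
      (((uint64_t)inlength << 32) + outlength / 2) / outlength;
  int x;
  for (x = 0; x < 4; ++x) {
    const int64_t y = delta * x;  /* the real code also adds `offset` */
    const int int_pel = (int)(y >> INTERP_PRECISION_BITS);
    const int sub_pel =
        (int)((y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK);
    printf("out %d -> src pel %d, phase %d\n", x, int_pel, sub_pel);
  }
  return 0;
}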
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+ uint16_t *output, int bd) {
+ // Actual filter len = 2 * filter_len_half.
+ const int16_t *filter = vp9_down2_symeven_half_filter;
+ const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
+ int i, j;
+ uint16_t *optr = output;
+ int l1 = filter_len_half;
+ int l2 = (length - filter_len_half);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_high(sum, bd);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_high(sum, bd);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_high(sum, bd);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] +
+ input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_high(sum, bd);
+ }
+ }
+}
+
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+ uint16_t *output, int bd) {
+ // Actual filter len = 2 * filter_len_half - 1.
+ const int16_t *filter = vp9_down2_symodd_half_filter;
+ const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
+ int i, j;
+ uint16_t *optr = output;
+ int l1 = filter_len_half - 1;
+ int l2 = (length - filter_len_half + 1);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_high(sum, bd);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_high(sum, bd);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_high(sum, bd);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_high(sum, bd);
+ }
+ }
+}
+
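// A sketch of the symmetric 2:1 decimation pattern used by the two functions
// above: each output sample pairs mirrored inputs, so only half of the taps
// are stored and multiplied. The coefficients below are hypothetical; the
// real ones live in vp9_down2_sym{even,odd}_half_filter.
#include <stdio.h>

#define FILTER_BITS 7

static const short kHalfFilter[4] = { 56, 12, -3, -1 };  /* sums to 64 */

// Interior samples only (no edge clamping), mirroring the "middle part"
// loop of highbd_down2_symeven().
static void down2_middle(const unsigned short *in, int n,
                         unsigned short *out) {
  int i, j;
  for (i = 8; i < n - 8; i += 2) {
    int sum = 1 << (FILTER_BITS - 1);
    for (j = 0; j < 4; ++j)
      sum += (in[i - j] + in[i + 1 + j]) * kHalfFilter[j];
    *out++ = (unsigned short)(sum >> FILTER_BITS);
  }
}

int main(void) {
  unsigned short in[32], out[8];
  int i;
  for (i = 0; i < 32; ++i) in[i] = 100;  /* flat signal */
  down2_middle(in, 32, out);
  // Each mirrored pair doubles the tap weight, so 2 * 64 == 1 << FILTER_BITS
  // and a flat signal passes through unchanged.
  printf("%u\n", out[0]);  /* prints 100 */
  return 0;
}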
+static void highbd_resize_multistep(const uint16_t *const input,
+ int length,
+ uint16_t *output,
+ int olength,
+ uint16_t *buf,
+ int bd) {
+ int steps;
+ if (length == olength) {
+ memcpy(output, input, sizeof(uint16_t) * length);
+ return;
+ }
+ steps = get_down2_steps(length, olength);
+
+ if (steps > 0) {
+ int s;
+ uint16_t *out = NULL;
+ uint16_t *tmpbuf = NULL;
+ uint16_t *otmp, *otmp2;
+ int filteredlength = length;
+ if (!buf) { // Use the caller-supplied scratch buffer when one is given.
+ tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * length);
+ otmp = tmpbuf;
+ } else {
+ otmp = buf;
+ }
+ otmp2 = otmp + get_down2_length(length, 1);
+ for (s = 0; s < steps; ++s) {
+ const int proj_filteredlength = get_down2_length(filteredlength, 1);
+ const uint16_t *const in = (s == 0 ? input : out);
+ if (s == steps - 1 && proj_filteredlength == olength)
+ out = output;
+ else
+ out = (s & 1 ? otmp2 : otmp);
+ if (filteredlength & 1)
+ highbd_down2_symodd(in, filteredlength, out, bd);
+ else
+ highbd_down2_symeven(in, filteredlength, out, bd);
+ filteredlength = proj_filteredlength;
+ }
+ if (filteredlength != olength) {
+ highbd_interpolate(out, filteredlength, output, olength, bd);
+ }
+ if (tmpbuf)
+ free(tmpbuf);
+ } else {
+ highbd_interpolate(input, length, output, olength, bd);
+ }
+}
+
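// A sketch of the multistep plan: get_down2_steps() counts how many clean
// 2:1 passes fit before a final arbitrary-ratio interpolate. The two helpers
// are re-stated here for the example and assumed to behave as their
// definitions earlier in vp9_resize.c.
#include <stdio.h>

static int get_down2_length(int length, int steps) {
  int s;
  for (s = 0; s < steps; ++s) length = (length + 1) >> 1;
  return length;
}

static int get_down2_steps(int in_length, int out_length) {
  int steps = 0, proj;
  while ((proj = get_down2_length(in_length, 1)) >= out_length) {
    ++steps;
    in_length = proj;
  }
  return steps;
}

int main(void) {
  int len = 1920;
  const int olen = 300;  /* hypothetical target width */
  int s;
  const int steps = get_down2_steps(len, olen);
  for (s = 0; s < steps; ++s) {
    len = get_down2_length(len, 1);  /* one down2_sym{even,odd} pass */
    printf("down2 step %d: %d\n", s + 1, len);
  }
  printf("interpolate %d -> %d\n", len, olen);  /* prints 480 -> 300 */
  return 0;
}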
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+ uint16_t *arr) {
+ int i;
+ uint16_t *iptr = img;
+ uint16_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *aptr++ = *iptr;
+ }
+}
+
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+ uint16_t *arr) {
+ int i;
+ uint16_t *iptr = img;
+ uint16_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *iptr = *aptr++;
+ }
+}
+
+void vp9_highbd_resize_plane(const uint8_t *const input,
+ int height,
+ int width,
+ int in_stride,
+ uint8_t *output,
+ int height2,
+ int width2,
+ int out_stride,
+ int bd) {
+ int i;
+ uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
+ uint16_t *tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) *
+ (width < height ? height : width));
+ uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2));
+ for (i = 0; i < height; ++i) {
+ highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
+ intbuf + width2 * i, width2, tmpbuf, bd);
+ }
+ for (i = 0; i < width2; ++i) {
+ highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+ highbd_resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf,
+ bd);
+ highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
+ arrbuf + height);
+ }
+ free(intbuf);
+ free(tmpbuf);
+ free(arrbuf);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
void vp9_resize_frame420(const uint8_t *const y,
int y_stride,
const uint8_t *const u, const uint8_t *const v,
vp9_resize_plane(v, height, width, uv_stride,
ov, oheight, owidth, ouv_stride);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_frame420(const uint8_t *const y,
+ int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride,
+ int height, int width,
+ uint8_t *oy, int oy_stride,
+ uint8_t *ou, uint8_t *ov, int ouv_stride,
+ int oheight, int owidth, int bd) {
+ vp9_highbd_resize_plane(y, height, width, y_stride,
+ oy, oheight, owidth, oy_stride, bd);
+ vp9_highbd_resize_plane(u, height / 2, width / 2, uv_stride,
+ ou, oheight / 2, owidth / 2, ouv_stride, bd);
+ vp9_highbd_resize_plane(v, height / 2, width / 2, uv_stride,
+ ov, oheight / 2, owidth / 2, ouv_stride, bd);
+}
+
+void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride,
+ int height, int width,
+ uint8_t *oy, int oy_stride,
+ uint8_t *ou, uint8_t *ov, int ouv_stride,
+ int oheight, int owidth, int bd) {
+ vp9_highbd_resize_plane(y, height, width, y_stride,
+ oy, oheight, owidth, oy_stride, bd);
+ vp9_highbd_resize_plane(u, height, width / 2, uv_stride,
+ ou, oheight, owidth / 2, ouv_stride, bd);
+ vp9_highbd_resize_plane(v, height, width / 2, uv_stride,
+ ov, oheight, owidth / 2, ouv_stride, bd);
+}
+
+void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride,
+ int height, int width,
+ uint8_t *oy, int oy_stride,
+ uint8_t *ou, uint8_t *ov, int ouv_stride,
+ int oheight, int owidth, int bd) {
+ vp9_highbd_resize_plane(y, height, width, y_stride,
+ oy, oheight, owidth, oy_stride, bd);
+ vp9_highbd_resize_plane(u, height, width, uv_stride,
+ ou, oheight, owidth, ouv_stride, bd);
+ vp9_highbd_resize_plane(v, height, width, uv_stride,
+ ov, oheight, owidth, ouv_stride, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
int oheight,
int owidth);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_plane(const uint8_t *const input,
+ int height,
+ int width,
+ int in_stride,
+ uint8_t *output,
+ int height2,
+ int width2,
+ int out_stride,
+ int bd);
+void vp9_highbd_resize_frame420(const uint8_t *const y,
+ int y_stride,
+ const uint8_t *const u,
+ const uint8_t *const v,
+ int uv_stride,
+ int height,
+ int width,
+ uint8_t *oy,
+ int oy_stride,
+ uint8_t *ou,
+ uint8_t *ov,
+ int ouv_stride,
+ int oheight,
+ int owidth,
+ int bd);
+void vp9_highbd_resize_frame422(const uint8_t *const y,
+ int y_stride,
+ const uint8_t *const u,
+ const uint8_t *const v,
+ int uv_stride,
+ int height,
+ int width,
+ uint8_t *oy,
+ int oy_stride,
+ uint8_t *ou,
+ uint8_t *ov,
+ int ouv_stride,
+ int oheight,
+ int owidth,
+ int bd);
+void vp9_highbd_resize_frame444(const uint8_t *const y,
+ int y_stride,
+ const uint8_t *const u,
+ const uint8_t *const v,
+ int uv_stride,
+ int height,
+ int width,
+ uint8_t *oy,
+ int oy_stride,
+ uint8_t *ou,
+ uint8_t *ov,
+ int ouv_stride,
+ int oheight,
+ int owidth,
+ int bd);
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // VP9_ENCODER_VP9_RESIZE_H_
if (MIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask = DISABLE_ALL_SPLIT;
sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
+ sf->partition_search_breakout_dist_thr = (1 << 25);
} else {
sf->max_intra_bsize = BLOCK_32X32;
sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
}
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 1;
sf->cb_partition_search = !boosted;
sf->cb_pred_filter_search = 1;
sf->alt_ref_search_fp = 1;
- sf->motion_field_mode_search = !boosted;
sf->recode_loop = ALLOW_RECODE_KFMAXBW;
sf->adaptive_rd_thresh = 3;
sf->mode_skip_start = 6;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
sf->adaptive_interp_filter_search = 1;
-
- if (MIN(cm->width, cm->height) >= 720)
- sf->partition_search_breakout_dist_thr = (1 << 25);
- else
- sf->partition_search_breakout_dist_thr = (1 << 23);
sf->partition_search_breakout_rate_thr = 1000;
}
sf->use_lp32x32fdct = 1;
sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->use_fast_coef_costing = 1;
+ sf->motion_field_mode_search = !boosted;
if (MIN(cm->width, cm->height) >= 720)
sf->partition_search_breakout_dist_thr = (1 << 26);
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
} else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_more;
+ } else if (sf->mv.subpel_search_method == SUBPEL_SURFACE_FIT) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_surface_fit;
}
cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
typedef enum {
SUBPEL_TREE = 0,
SUBPEL_TREE_PRUNED = 1,
+ SUBPEL_TREE_PRUNED_MORE = 2,
+ SUBPEL_SURFACE_FIT = 3,
// Other methods to come
} SUBPEL_SEARCH_METHODS;
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_ssim_parms_8x8_c(uint16_t *s, int sp, uint16_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r,
+ uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+ uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+void vp9_highbd_ssim_parms_8x8_shift_c(uint16_t *s, int sp, uint16_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r,
+ uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+ uint32_t *sum_sxr, unsigned int bd,
+ unsigned int shift) {
+ int i, j;
+ const int max_val = (1 << bd) - 1;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ int sj = s[j] > max_val ? max_val : s[j];
+ int rj = r[j] > max_val ? max_val : r[j];
+ // Scale back to the nominal bit depth so the caller's constants apply.
+ sj >>= shift;
+ rj >>= shift;
+ *sum_s += sj;
+ *sum_r += rj;
+ *sum_sq_s += sj * sj;
+ *sum_sq_r += rj * rj;
+ *sum_sxr += sj * rj;
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static const int64_t cc1 = 26634; // (64^2*(.01*255)^2)
static const int64_t cc2 = 239708; // (64^2*(.03*255)^2)
return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static double high_ssim_8x8_shift(uint16_t *s, int sp, uint16_t *r, int rp,
+ unsigned int bd, unsigned int shift) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ const int oshift = bd - 8;
+ vp9_highbd_ssim_parms_8x8_shift(s, sp, r, rp, &sum_s,
+ &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr, bd, shift);
+ return similarity(sum_s >> oshift,
+ sum_r >> oshift,
+ sum_sq_s >> (2 * oshift),
+ sum_sq_r >> (2 * oshift),
+ sum_sxr >> (2 * oshift),
+ 64);
+}
+
+static double high_ssim_8x8(uint16_t *s, int sp, uint16_t *r, int rp,
+ unsigned int bd) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ const int oshift = bd - 8;
+ vp9_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s >> oshift,
+ sum_r >> oshift,
+ sum_sq_s >> (2 * oshift),
+ sum_sq_r >> (2 * oshift),
+ sum_sxr >> (2 * oshift),
+ 64);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
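// Why the >> (bd - 8) and >> (2 * (bd - 8)) shifts above: they bring the
// high-bit-depth sums back to an 8-bit scale so similarity() can keep using
// its 8-bit constants cc1/cc2. A minimal numeric check (values hypothetical):
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const unsigned bd = 10, oshift = bd - 8;
  const int pel8 = 200;                 /* a pixel as stored at 8 bits */
  const int pel10 = pel8 << oshift;     /* the same pixel at 10 bits */
  const uint32_t sum = 64 * pel10;      /* an 8x8 block of identical pixels */
  const uint32_t sum_sq = 64 * (uint32_t)pel10 * pel10;
  /* Prints the 8-bit sums: 12800 and 2560000. */
  printf("%u %u\n", sum >> oshift, sum_sq >> (2 * oshift));
  return 0;
}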
// We are using a 8x8 moving window with starting location of each 8x8 window
// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
// block boundaries to penalize blocking artifacts.
ssim_total /= samples;
return ssim_total;
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vp9_highbd_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
+ int stride_img2, int width, int height,
+ unsigned int bd, unsigned int shift) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+ if (shift) {
+ // Sample points start at each 4x4 location.
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = high_ssim_8x8_shift(CONVERT_TO_SHORTPTR(img1 + j),
+ stride_img1,
+ CONVERT_TO_SHORTPTR(img2 + j),
+ stride_img2,
+ bd, shift);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ } else {
+ // Sample points start at each 4x4 location.
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = high_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+ CONVERT_TO_SHORTPTR(img2 + j), stride_img2,
+ bd);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
double *weight) {
double a, b, c;
return ssim_all;
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ double *weight, unsigned int bd,
+ unsigned int shift) {
+ double a, b, c;
+ double ssimv;
+
+ a = vp9_highbd_ssim2(source->y_buffer, dest->y_buffer,
+ source->y_stride, dest->y_stride,
+ source->y_crop_width, source->y_crop_height,
+ bd, shift);
+
+ b = vp9_highbd_ssim2(source->u_buffer, dest->u_buffer,
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height,
+ bd, shift);
+
+ c = vp9_highbd_ssim2(source->v_buffer, dest->v_buffer,
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height,
+ bd, shift);
+
+ ssimv = a * .8 + .1 * (b + c);
+
+ *weight = 1;
+
+ return ssimv;
+}
+
+double vp9_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v,
+ unsigned int bd, unsigned int shift) {
+ double ssim_all = 0;
+ double a, b, c;
+
+ a = vp9_highbd_ssim2(source->y_buffer, dest->y_buffer,
+ source->y_stride, dest->y_stride,
+ source->y_crop_width, source->y_crop_height,
+ bd, shift);
+
+ b = vp9_highbd_ssim2(source->u_buffer, dest->u_buffer,
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height,
+ bd, shift);
+
+ c = vp9_highbd_ssim2(source->v_buffer, dest->v_buffer,
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height,
+ bd, shift);
+ *ssim_y = a;
+ *ssim_u = b;
+ *ssim_v = c;
+ ssim_all = (a * 4 + b + c) / 6;
+
+ return ssim_all;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
double *ssim_y, double *ssim_u, double *ssim_v);
+#if CONFIG_VP9_HIGHBITDEPTH
+double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ double *weight,
+ unsigned int bd,
+ unsigned int shift);
+
+double vp9_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ double *ssim_y,
+ double *ssim_u,
+ double *ssim_v,
+ unsigned int bd,
+ unsigned int shift);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#ifdef __cplusplus
} // extern "C"
#endif
mv_precision_uv = MV_PRECISION_Q3;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_high_build_inter_predictor(y_mb_ptr, stride,
+ &pred[0], 16,
+ &mv,
+ scale,
+ 16, 16,
+ which_mv,
+ kernel, MV_PRECISION_Q3, x, y, xd->bd);
+
+ vp9_high_build_inter_predictor(u_mb_ptr, uv_stride,
+ &pred[256], uv_block_width,
+ &mv,
+ scale,
+ uv_block_width, uv_block_height,
+ which_mv,
+ kernel, mv_precision_uv, x, y, xd->bd);
+
+ vp9_high_build_inter_predictor(v_mb_ptr, uv_stride,
+ &pred[512], uv_block_width,
+ &mv,
+ scale,
+ uv_block_width, uv_block_height,
+ which_mv,
+ kernel, mv_precision_uv, x, y, xd->bd);
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
vp9_build_inter_predictor(y_mb_ptr, stride,
&pred[0], 16,
&mv,
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1_8,
+ unsigned int stride,
+ uint8_t *frame2_8,
+ unsigned int block_width,
+ unsigned int block_height,
+ int strength,
+ int filter_weight,
+ unsigned int *accumulator,
+ uint16_t *count) {
+ uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
+ uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
+ unsigned int i, j, k;
+ int modifier;
+ int byte = 0;
+ const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+ for (i = 0, k = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++, k++) {
+ int src_byte = frame1[byte];
+ int pixel_value = *frame2++;
+
+ modifier = src_byte - pixel_value;
+ // This is an integer approximation of:
+ // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+ // modifier = (int)roundf(coeff > 16 ? 0 : 16 - coeff);
+ modifier *= modifier;
+ modifier *= 3;
+ modifier += rounding;
+ modifier >>= strength;
+
+ if (modifier > 16)
+ modifier = 16;
+
+ modifier = 16 - modifier;
+ modifier *= filter_weight;
+
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+
+ byte++;
+ }
+
+ byte += stride - block_width;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
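// A worked instance of the modifier computation above: the weight falls off
// quadratically with the pixel difference and saturates to zero. For high
// bit depth the caller raises `strength` by 2 * (bd - 8) because the squared
// difference grows by that power of two. All values below are hypothetical.
#include <stdio.h>

int main(void) {
  const int strength = 6;
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
  const int filter_weight = 2;
  int diff;
  for (diff = 0; diff <= 24; diff += 8) {
    int modifier = diff * diff * 3 + rounding;
    modifier >>= strength;               /* ~ (3 * diff^2) / 2^strength */
    if (modifier > 16) modifier = 16;
    modifier = 16 - modifier;            /* large diff -> small weight */
    printf("diff=%2d weight=%2d\n", diff, modifier * filter_weight);
  }
  /* Prints weights 32, 26, 8, 0 for differences 0, 8, 16, 24. */
  return 0;
}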
static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
uint8_t *arf_frame_buf,
uint8_t *frame_ptr_buf,
MACROBLOCKD *mbd = &cpi->mb.e_mbd;
YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
uint8_t *dst1, *dst2;
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, predictor16, 16 * 16 * 3);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor8, 16 * 16 * 3);
+ uint8_t *predictor;
+#else
DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
+#endif
const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
// Save input state
uint8_t* input_buffer[MAX_MB_PLANE];
int i;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ predictor = CONVERT_TO_BYTEPTR(predictor16);
+ } else {
+ predictor = predictor8;
+ }
+#endif
for (i = 0; i < MAX_MB_PLANE; i++)
input_buffer[i] = mbd->plane[i].pre[0].buf;
predictor, scale,
mb_col * 16, mb_row * 16);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int adj_strength = strength + 2 * (mbd->bd - 8);
+ // Apply the filter (YUV)
+ vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
+ f->y_stride,
+ predictor, 16, 16, adj_strength,
+ filter_weight,
+ accumulator, count);
+ vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
+ f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height,
+ adj_strength,
+ filter_weight, accumulator + 256,
+ count + 256);
+ vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
+ f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height,
+ adj_strength, filter_weight,
+ accumulator + 512, count + 512);
+ } else {
+ // Apply the filter (YUV)
+ vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16,
+ strength, filter_weight,
+ accumulator, count);
+ vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+ predictor + 256,
+ mb_uv_width, mb_uv_height, strength,
+ filter_weight, accumulator + 256,
+ count + 256);
+ vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+ predictor + 512,
+ mb_uv_width, mb_uv_height, strength,
+ filter_weight, accumulator + 512,
+ count + 512);
+ }
+#else
// Apply the filter (YUV)
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16,
mb_uv_width, mb_uv_height, strength,
filter_weight, accumulator + 512,
count + 512);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *dst1_16;
+ uint16_t *dst2_16;
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= fixed_divide[count[k]];
+ pval >>= 19;
+
+ dst1_16[byte] = (uint16_t)pval;
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+
+ // U
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= fixed_divide[count[k]];
+ pval >>= 19;
+ dst1_16[byte] = (uint16_t)pval;
+
+ // V
+ pval = accumulator[m] + (count[m] >> 1);
+ pval *= fixed_divide[count[m]];
+ pval >>= 19;
+ dst2_16[byte] = (uint16_t)pval;
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - mb_uv_width;
+ }
+ } else {
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= fixed_divide[count[k]];
+ pval >>= 19;
+
+ dst1[byte] = (uint8_t)pval;
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+
+ // U
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= fixed_divide[count[k]];
+ pval >>= 19;
+ dst1[byte] = (uint8_t)pval;
+
+ // V
+ pval = accumulator[m] + (count[m] >> 1);
+ pval *= fixed_divide[count[m]];
+ pval >>= 19;
+ dst2[byte] = (uint8_t)pval;
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+ }
+#else
// Normalize filter output to produce AltRef frame
dst1 = cpi->alt_ref_buffer.y_buffer;
stride = cpi->alt_ref_buffer.y_stride;
}
byte += stride - mb_uv_width;
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
mb_y_offset += 16;
mb_uv_offset += mb_uv_width;
}
get_frame_new_buffer(cm)->y_crop_height,
get_frame_new_buffer(cm)->y_crop_width,
get_frame_new_buffer(cm)->y_crop_height);
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
for (frame = 0; frame < frames_to_blur; ++frame) {
if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
frames[0]->y_crop_height,
frames[0]->y_crop_width,
frames[0]->y_crop_height);
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
temporal_filter_iterate_c(cpi, frames, frames_to_blur,
* fields are not.
*/
extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const int16_t *vp9_dct_value_cost_high10_ptr;
+extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
+extern const int16_t *vp9_dct_value_cost_high12_ptr;
+extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
+#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_intrapred_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_loopfilter_intrin_sse2.c
endif
# common (c)
}
#if !CONFIG_VP9_HIGHBITDEPTH
- if (cfg->g_profile > (unsigned int)PROFILE_1)
+ if (cfg->g_profile > (unsigned int)PROFILE_1) {
ERROR("Profile > 1 not supported in this build configuration");
+ }
#endif
if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
- extra_cfg->bit_depth > VPX_BITS_8)
+ cfg->g_bit_depth > VPX_BITS_8) {
ERROR("Codec high bit-depth not supported in profile < 2");
+ }
+ if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+ cfg->g_input_bit_depth > 8) {
+ ERROR("Source high bit-depth not supported in profile < 2");
+ }
if (cfg->g_profile > (unsigned int)PROFILE_1 &&
- extra_cfg->bit_depth == VPX_BITS_8)
+ cfg->g_bit_depth == VPX_BITS_8) {
ERROR("Codec bit-depth 8 not supported in profile > 1");
+ }
return VPX_CODEC_OK;
}
-
static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
const vpx_image_t *img) {
switch (img->fmt) {
case VPX_IMG_FMT_YV12:
case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_I42016:
+ break;
case VPX_IMG_FMT_I422:
case VPX_IMG_FMT_I444:
+ if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) {
+ ERROR("Invalid image format. I422, I444 images are "
+ "not supported in profile.");
+ }
+ break;
+ case VPX_IMG_FMT_I42216:
+ case VPX_IMG_FMT_I44416:
+ if (ctx->cfg.g_profile != (unsigned int)PROFILE_1 &&
+ ctx->cfg.g_profile != (unsigned int)PROFILE_3) {
+ ERROR("Invalid image format. 16-bit I422, I444 images are "
+ "not supported in profile.");
+ }
break;
default:
ERROR("Invalid image format. Only YV12, I420, I422, I444 images are "
oxcf->profile = cfg->g_profile;
oxcf->width = cfg->g_w;
oxcf->height = cfg->g_h;
- oxcf->bit_depth = extra_cfg->bit_depth;
+ oxcf->bit_depth = cfg->g_bit_depth;
oxcf->input_bit_depth = cfg->g_input_bit_depth;
// guess a frame rate if out of whack, use 30
oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
// call to get_frame.
if (!(*iter)) {
img = &ctx->img;
- img->bit_depth = (int)ctx->pbi->common.bit_depth;
*iter = img;
}
}
img->stride[VPX_PLANE_U] = yv12->uv_stride;
img->stride[VPX_PLANE_V] = yv12->uv_stride;
img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+ // vpx_image_t uses byte strides and a pointer to the first byte
+ // of the image.
+ img->fmt |= VPX_IMG_FMT_HIGHBITDEPTH;
+ img->bit_depth = yv12->bit_depth;
+ img->planes[VPX_PLANE_Y] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+ img->planes[VPX_PLANE_U] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+ img->planes[VPX_PLANE_V] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+ img->planes[VPX_PLANE_ALPHA] = NULL;
+ img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride;
+ img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride;
+ img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride;
+ img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
img->bps = bps;
img->user_priv = user_priv;
img->img_data = yv12->buffer_alloc;
yv12->y_stride = img->stride[VPX_PLANE_Y];
yv12->uv_stride = img->stride[VPX_PLANE_U];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ // In vpx_image_t
+ // planes point to uint8 address of start of data
+ // stride counts uint8s to reach next row
+ // In YV12_BUFFER_CONFIG
+ // y_buffer, u_buffer, v_buffer point to uint16 address of data
+ // stride and border counts in uint16s
+ // This means that all the address calculations in the main body of code
+ // should work correctly.
+ // However, before we do any pixel operations we need to cast the address
+ // to a uint16 pointer and double its value.
+ yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+ yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+ yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+ yv12->y_stride >>= 1;
+ yv12->uv_stride >>= 1;
+ yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+ } else {
+ yv12->flags = 0;
+ }
+ yv12->border = (yv12->y_stride - img->w) / 2;
+#else
+ yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
- yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
return VPX_CODEC_OK;
}
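// A sketch of the pointer convention the comment above describes. The two
// macros are assumed to match vp9/common/vp9_common.h under
// CONFIG_VP9_HIGHBITDEPTH: buffers hold uint16_t samples, but the pointers
// handed around are the real address shifted right by one, so byte-oriented
// stride arithmetic advances one sample per "byte".
#include <stdint.h>
#include <stdio.h>

#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

int main(void) {
  static uint16_t pixels[16] = { 1023 };  /* real 10-bit storage; 2-byte
                                             aligned, so >> 1 loses nothing */
  uint8_t *as_bytes = CONVERT_TO_BYTEPTR(pixels);
  /* Offset math in "bytes" moves one uint16_t sample per step... */
  uint16_t *p = CONVERT_TO_SHORTPTR(as_bytes + 4);
  /* ...so p now points at pixels[4], and p - 4 is pixels[0]. */
  printf("%u\n", (unsigned)p[-4]);  /* prints 1023 */
  return 0;
}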
case VPX_IMG_FMT_YV12:
case VPX_IMG_FMT_VPXI420:
case VPX_IMG_FMT_VPXYV12:
+ case VPX_IMG_FMT_I42016:
ycs = 1;
break;
default:
img->planes[VPX_PLANE_PACKED] =
img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED];
} else {
+ const int bytes_per_sample =
+ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
data = img->img_data;
if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) {
img->planes[VPX_PLANE_ALPHA] =
- data + x + y * img->stride[VPX_PLANE_ALPHA];
+ data + x * bytes_per_sample + y * img->stride[VPX_PLANE_ALPHA];
data += img->h * img->stride[VPX_PLANE_ALPHA];
}
- img->planes[VPX_PLANE_Y] = data + x + y * img->stride[VPX_PLANE_Y];
+ img->planes[VPX_PLANE_Y] = data + x * bytes_per_sample +
+ y * img->stride[VPX_PLANE_Y];
data += img->h * img->stride[VPX_PLANE_Y];
if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) {
- img->planes[VPX_PLANE_U] = data
- + (x >> img->x_chroma_shift)
- + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+ img->planes[VPX_PLANE_U] =
+ data + (x >> img->x_chroma_shift) * bytes_per_sample +
+ (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
- img->planes[VPX_PLANE_V] = data
- + (x >> img->x_chroma_shift)
- + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+ img->planes[VPX_PLANE_V] =
+ data + (x >> img->x_chroma_shift) * bytes_per_sample +
+ (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
} else {
- img->planes[VPX_PLANE_V] = data
- + (x >> img->x_chroma_shift)
- + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+ img->planes[VPX_PLANE_V] =
+ data + (x >> img->x_chroma_shift) * bytes_per_sample +
+ (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
- img->planes[VPX_PLANE_U] = data
- + (x >> img->x_chroma_shift)
- + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+ img->planes[VPX_PLANE_U] =
+ data + (x >> img->x_chroma_shift) * bytes_per_sample +
+ (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
}
}
-
return 0;
}
-
return -1;
}
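// The offset math patched above, worked through for a hypothetical 64x64
// 16-bit 4:2:0 image: strides are byte counts, so only the x term needs the
// bytes-per-sample scale; the y term already carries it via the stride.
#include <stdio.h>

int main(void) {
  const int w = 64;
  const int bytes_per_sample = 2;             /* VPX_IMG_FMT_HIGHBITDEPTH */
  const int stride_y = w * bytes_per_sample;  /* bytes per luma row */
  const int x = 8, y = 4;                     /* rect origin in pixels */
  const int x_chroma_shift = 1, y_chroma_shift = 1;  /* 4:2:0 */
  const int stride_u = (w >> x_chroma_shift) * bytes_per_sample;
  const long y_off = (long)x * bytes_per_sample + (long)y * stride_y;
  const long u_off = (long)(x >> x_chroma_shift) * bytes_per_sample +
                     (long)(y >> y_chroma_shift) * stride_u;
  printf("y_off=%ld u_off=%ld\n", y_off, u_off);  /* prints 528 and 136 */
  return 0;
}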
#include <inttypes.h>
#endif
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+#if CONFIG_VP9_HIGHBITDEPTH && CONFIG_VP9
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif
+
#endif // VPX_VPX_INTEGER_H_
int arm_cpu_caps(void);
+// gcc 4.x releases up to 4.6 have issues with some NEON intrinsics.
+#if !defined(__clang__) && defined(__GNUC__) && \
+ __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+#define VPX_INCOMPATIBLE_GCC
+#endif
+
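// A sketch of how the guard above is meant to be consumed in the NEON source
// files (the guarded function names here are hypothetical):
#ifndef VPX_INCOMPATIBLE_GCC
// Intrinsics path: safe on clang and on unaffected gcc releases.
void vp9_example_loop_filter_neon(void);
#else
// Fallback path for the affected gcc releases.
void vp9_example_loop_filter_c(void);
#endif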
#ifdef __cplusplus
} // extern "C"
#endif
int h = src->d_h;
int x, y;
if (plane) {
- w >>= src->x_chroma_shift;
- h >>= src->y_chroma_shift;
+ w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+ h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
}
for (y = 0; y < h; y++) {
uint16_t *p_src = (uint16_t *)(src->planes[plane] +
int h = src->d_h;
int x, y;
if (plane) {
- w >>= src->x_chroma_shift;
- h >>= src->y_chroma_shift;
+ w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+ h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
}
for (y = 0; y < h; y++) {
uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
int h = src->d_h;
int x, y;
if (plane) {
- w >>= src->x_chroma_shift;
- h >>= src->y_chroma_shift;
+ w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+ h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
}
for (y = 0; y < h; y++) {
uint16_t *p_src = (uint16_t *)(src->planes[plane] +
int h = src->d_h;
int x, y;
if (plane) {
- w >>= src->x_chroma_shift;
- h >>= src->y_chroma_shift;
+ w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+ h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
}
for (y = 0; y < h; y++) {
uint16_t *p_src = (uint16_t *)(src->planes[plane] +
low_img_downshift(dst, src, down_shift);
}
}
+
+static int img_shifted_realloc_required(const vpx_image_t *img,
+ const vpx_image_t *shifted,
+ vpx_img_fmt_t required_fmt) {
+ return img->d_w != shifted->d_w ||
+ img->d_h != shifted->d_h ||
+ required_fmt != shifted->fmt;
+}
#endif
int main_loop(int argc, const char **argv_) {
}
// Shift up or down if necessary
if (output_bit_depth != img->bit_depth) {
+ const vpx_img_fmt_t shifted_fmt = output_bit_depth == 8 ?
+ img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) :
+ img->fmt | VPX_IMG_FMT_HIGHBITDEPTH;
+ if (img_shifted &&
+ img_shifted_realloc_required(img, img_shifted, shifted_fmt)) {
+ vpx_img_free(img_shifted);
+ img_shifted = NULL;
+ }
if (!img_shifted) {
- if (output_bit_depth == 8) {
- img_shifted = vpx_img_alloc(
- NULL, img->fmt - VPX_IMG_FMT_HIGHBITDEPTH,
- img->d_w, img->d_h, 16);
- } else {
- img_shifted = vpx_img_alloc(
- NULL, img->fmt | VPX_IMG_FMT_HIGHBITDEPTH,
- img->d_w, img->d_h, 16);
- }
+ img_shifted = vpx_img_alloc(NULL, shifted_fmt,
+ img->d_w, img->d_h, 16);
img_shifted->bit_depth = output_bit_depth;
}
if (output_bit_depth > img->bit_depth) {