granicus.if.org Git - libvpx/commitdiff
Merge "Refactor encode_rd_sb_row function"
author    Yunqing Wang <yunqingwang@google.com>
          Tue, 30 Sep 2014 18:58:39 +0000 (11:58 -0700)
committer Gerrit Code Review <gerrit@gerrit.golo.chromium.org>
          Tue, 30 Sep 2014 18:58:39 +0000 (11:58 -0700)
53 files changed:
build/make/configure.sh
test/lpf_8_test.cc [new file with mode: 0644]
test/test.mk
tools_common.c
vp8/common/arm/neon/loopfilter_neon.c
vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
vp8/encoder/mcomp.c
vp9/common/vp9_blockd.h
vp9/common/vp9_idct.c
vp9/common/vp9_idct.h
vp9/common/vp9_loopfilter.c
vp9/common/vp9_loopfilter_filters.c
vp9/common/vp9_onyxc_int.h
vp9/common/vp9_postproc.c
vp9/common/vp9_reconinter.c
vp9/common/vp9_rtcd_defs.pl
vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c [new file with mode: 0644]
vp9/decoder/vp9_decodeframe.c
vp9/decoder/vp9_decoder.c
vp9/encoder/vp9_aq_variance.c
vp9/encoder/vp9_bitstream.c
vp9/encoder/vp9_denoiser.c
vp9/encoder/vp9_encodeframe.c
vp9/encoder/vp9_encodemb.c
vp9/encoder/vp9_encoder.c
vp9/encoder/vp9_encoder.h
vp9/encoder/vp9_extend.c
vp9/encoder/vp9_firstpass.c
vp9/encoder/vp9_mcomp.c
vp9/encoder/vp9_mcomp.h
vp9/encoder/vp9_picklpf.c
vp9/encoder/vp9_pickmode.c
vp9/encoder/vp9_rd.c
vp9/encoder/vp9_rd.h
vp9/encoder/vp9_rdopt.c
vp9/encoder/vp9_rdopt.h
vp9/encoder/vp9_resize.c
vp9/encoder/vp9_resize.h
vp9/encoder/vp9_speed_features.c
vp9/encoder/vp9_speed_features.h
vp9/encoder/vp9_ssim.c
vp9/encoder/vp9_ssim.h
vp9/encoder/vp9_temporal_filter.c
vp9/encoder/vp9_tokenize.h
vp9/vp9_common.mk
vp9/vp9_cx_iface.c
vp9/vp9_dx_iface.c
vp9/vp9_iface_common.h
vpx/src/vpx_image.c
vpx/vpx_integer.h
vpx_ports/arm.h
vpxdec.c

index ab6687f73ac944fd2fba1f2e47be491ea0892854..d07f87749a45bc769ceab1dcdba7d5482f57de1c 100644 (file)
@@ -788,8 +788,8 @@ process_common_toolchain() {
             add_ldflags "-mmacosx-version-min=10.9"
             ;;
         *-iphonesimulator-*)
-            add_cflags  "-miphoneos-version-min=5.0"
-            add_ldflags "-miphoneos-version-min=5.0"
+            add_cflags  "-miphoneos-version-min=6.0"
+            add_ldflags "-miphoneos-version-min=6.0"
             osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
             add_cflags  "-isysroot ${osx_sdk_dir}"
             add_ldflags "-isysroot ${osx_sdk_dir}"
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
new file mode 100644 (file)
index 0000000..abc4107
--- /dev/null
@@ -0,0 +1,587 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+// Horizontally and Vertically need 32x32: 8  Coeffs preceding filtered section
+//                                         16 Coeffs within filtered section
+//                                         8  Coeffs following filtered section
+const int kNumCoeffs = 1024;
+
+const int number_of_iterations = 10000;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count, int bd);
+typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1, int bd);
+#else
+typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count);
+typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
+typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count, int bd) {
+  vp9_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count, int bd) {
+  vp9_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count, int bd) {
+  vp9_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count, int bd) {
+  vp9_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
+}
+#else
+void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count) {
+  vp9_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count) {
+  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count) {
+  vp9_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count) {
+  vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
+ public:
+  virtual ~Loop8Test6Param() {}
+  virtual void SetUp() {
+    loopfilter_op_ = GET_PARAM(0);
+    ref_loopfilter_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;
+  int mask_;
+  loop_op_t loopfilter_op_;
+  loop_op_t ref_loopfilter_op_;
+};
+
+class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
+ public:
+  virtual ~Loop8Test9Param() {}
+  virtual void SetUp() {
+    loopfilter_op_ = GET_PARAM(0);
+    ref_loopfilter_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;
+  int mask_;
+  dual_loop_op_t loopfilter_op_;
+  dual_loop_op_t ref_loopfilter_op_;
+};
+
+TEST_P(Loop8Test6Param, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int32_t bd = bit_depth_;
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, ref_s, kNumCoeffs);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;
+    int count = 1;
+
+    uint16_t tmp_s[kNumCoeffs];
+    int j = 0;
+    while (j < kNumCoeffs) {
+      uint8_t val = rnd.Rand8();
+      if (val & 0x80) {  // 50% chance to choose a new value.
+        tmp_s[j] = rnd.Rand16();
+        j++;
+      } else {  // 50% chance to repeat previous value in row X times.
+        int k = 0;
+        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+          if (j < 1) {
+            tmp_s[j] = rnd.Rand16();
+          } else if (val & 0x20) {  // Increment by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] + (*limit - 1));
+          } else {  // Decrement by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] - (*limit - 1));
+          }
+          j++;
+        }
+      }
+    }
+    for (j = 0; j < kNumCoeffs; j++) {
+      if (i % 2) {
+        s[j] = tmp_s[j] & mask_;
+      } else {
+        s[j] = tmp_s[p * (j % p) + j / p] & mask_;
+      }
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test6Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test6Param, ValueCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32_t bd = bit_depth_;
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(8, uint8_t, ref_s, kNumCoeffs);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;
+    int count = 1;
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      s[j] = rnd.Rand16() & mask_;
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test6Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test9Param, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32_t bd = bit_depth_;
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+  DECLARE_ALIGNED_ARRAY(8,  uint8_t,  s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(8,  uint8_t,  ref_s, kNumCoeffs);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit0[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit1[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;
+    uint16_t tmp_s[kNumCoeffs];
+    int j = 0;
+    const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1;
+    while (j < kNumCoeffs) {
+      uint8_t val = rnd.Rand8();
+      if (val & 0x80) {  // 50% chance to choose a new value.
+        tmp_s[j] = rnd.Rand16();
+        j++;
+      } else {  // 50% chance to repeat previous value in row X times.
+        int k = 0;
+        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+          if (j < 1) {
+            tmp_s[j] = rnd.Rand16();
+          } else if (val & 0x20) {  // Increment by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] + (limit - 1));
+          } else {  // Decrement by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] - (limit - 1));
+          }
+          j++;
+        }
+      }
+    }
+    for (j = 0; j < kNumCoeffs; j++) {
+      if (i % 2) {
+        s[j] = tmp_s[j] & mask_;
+      } else {
+        s[j] = tmp_s[p * (j % p) + j / p] & mask_;
+      }
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test9Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test9Param, ValueCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
+#else
+  DECLARE_ALIGNED_ARRAY(8,  uint8_t, s, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(8,  uint8_t, ref_s, kNumCoeffs);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit0[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, limit1[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;  // TODO(pdlf) can we have non-square here?
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      s[j] = rnd.Rand16() & mask_;
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int32_t bd = bit_depth_;
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0,
+                       thresh0, blimit1, limit1, thresh1, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test9Param, C output doesn't match SSE2"
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
+                   &vp9_highbd_lpf_horizontal_4_c, 8),
+        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
+                   &vp9_highbd_lpf_vertical_4_c, 8),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
+                   &vp9_highbd_lpf_horizontal_8_c, 8),
+        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+                   &vp9_highbd_lpf_horizontal_16_c, 8),
+        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
+                   &vp9_highbd_lpf_vertical_8_c, 8),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 8),
+        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
+                   &vp9_highbd_lpf_horizontal_4_c, 10),
+        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
+                   &vp9_highbd_lpf_vertical_4_c, 10),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
+                   &vp9_highbd_lpf_horizontal_8_c, 10),
+        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+                   &vp9_highbd_lpf_horizontal_16_c, 10),
+        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
+                   &vp9_highbd_lpf_vertical_8_c, 10),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 10),
+        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
+                   &vp9_highbd_lpf_horizontal_4_c, 12),
+        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
+                   &vp9_highbd_lpf_vertical_4_c, 12),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
+                   &vp9_highbd_lpf_horizontal_8_c, 12),
+        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+                   &vp9_highbd_lpf_horizontal_16_c, 12),
+        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
+                   &vp9_highbd_lpf_vertical_8_c, 12),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8),
+        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8),
+        make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_DUAL, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 8),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 10),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE2_C_COMPARE_DUAL, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE_C_COMPARE_DUAL, Loop8Test9Param,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
+                   &vp9_highbd_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
+                   &vp9_highbd_lpf_vertical_8_dual_c, 8),
+        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_4_dual_c, 10),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_8_dual_c, 10),
+        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
+                   &vp9_highbd_lpf_vertical_4_dual_c, 10),
+        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
+                   &vp9_highbd_lpf_vertical_8_dual_c, 10),
+        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_4_dual_c, 12),
+        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
+                   &vp9_highbd_lpf_horizontal_8_dual_c, 12),
+        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
+                   &vp9_highbd_lpf_vertical_4_dual_c, 12),
+        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
+                   &vp9_highbd_lpf_vertical_8_dual_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE_C_COMPARE_DUAL, Loop8Test9Param,
+    ::testing::Values(
+        make_tuple(&vp9_lpf_horizontal_4_dual_sse2,
+                   &vp9_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vp9_lpf_horizontal_8_dual_sse2,
+                   &vp9_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vp9_lpf_vertical_4_dual_sse2,
+                   &vp9_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vp9_lpf_vertical_8_dual_sse2,
+                   &vp9_lpf_vertical_8_dual_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+}  // namespace
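
The pattern generator used by the OperationCheck tests above is worth calling out: instead of pure noise, it emits runs of samples that drift by exactly (limit - 1) per step, interleaved with unconstrained jumps, so both the "filter this edge" and "leave this edge alone" branches of the mask logic are exercised. A standalone sketch of the same idea, with illustrative names and C's rand() standing in for ACMRandom:

    #include <stdint.h>
    #include <stdlib.h>

    /* Fill buf with piecewise-smooth data: runs of up to 32 samples that
     * drift by (limit - 1) per step, mixed with free random jumps. */
    static void fill_ramped(uint16_t *buf, int n, uint8_t limit) {
      int j = 0;
      while (j < n) {
        const uint8_t val = (uint8_t)rand();
        if (val & 0x80) {          /* 50%: jump to a new random level */
          buf[j++] = (uint16_t)rand();
        } else {                   /* 50%: run of 1..32 limited steps */
          int k = 0;
          while (k++ < ((val & 0x1f) + 1) && j < n) {
            if (j < 1)
              buf[j] = (uint16_t)rand();
            else if (val & 0x20)   /* drift upward, within the limit */
              buf[j] = (uint16_t)(buf[j - 1] + (limit - 1));
            else                   /* drift downward, within the limit */
              buf[j] = (uint16_t)(buf[j - 1] - (limit - 1));
            j++;
          }
        }
      }
    }
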
index 97b8abab4623f17a119d913b1d3b16d9667dee3e..df4969f5c6916064dd0f7e62949513a790674c14 100644 (file)
@@ -128,6 +128,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += vp9_intrapred_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
index 2ec17117c004a4a9c4523c052c1524fe18d901be..0d4774f1a3f46e7c325463b0a79f3854a2360203 100644 (file)
@@ -224,7 +224,8 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) {
   for (plane = 0; plane < 3; ++plane) {
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane);
+    const int w = vpx_img_plane_width(img, plane) *
+        ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
     const int h = vpx_img_plane_height(img, plane);
     int y;
 
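
The tools_common.c change above fixes raw-frame dumping for high-bit-depth input: the plane keeps the same pixel width, but each sample occupies two bytes, so the per-row write length must double. As a hedged one-liner (the helper name is illustrative; VPX_IMG_FMT_HIGHBITDEPTH is the real format flag):

    #include "vpx/vpx_image.h"

    /* Bytes per row of one plane: high-bit-depth images store 16-bit
     * samples, so each pixel costs 2 bytes instead of 1. */
    static int plane_row_bytes(const vpx_image_t *img, int pixels) {
      return pixels * ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
    }
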
index 0bec7fb06431365ab104bd095d4b67e52b157098..9d6807af7157e3644b99ad0ee7f3fd6e41a0de7c 100644 (file)
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 #include "./vpx_config.h"
+#include "vpx_ports/arm.h"
 
 static INLINE void vp8_loop_filter_neon(
         uint8x16_t qblimit,  // flimit
@@ -253,23 +254,7 @@ void vp8_loop_filter_horizontal_edge_uv_neon(
 
 static INLINE void write_4x8(unsigned char *dst, int pitch,
                              const uint8x8x4_t result) {
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-    vst4_lane_u8(dst, result, 0);
-    dst += pitch;
-    vst4_lane_u8(dst, result, 1);
-    dst += pitch;
-    vst4_lane_u8(dst, result, 2);
-    dst += pitch;
-    vst4_lane_u8(dst, result, 3);
-    dst += pitch;
-    vst4_lane_u8(dst, result, 4);
-    dst += pitch;
-    vst4_lane_u8(dst, result, 5);
-    dst += pitch;
-    vst4_lane_u8(dst, result, 6);
-    dst += pitch;
-    vst4_lane_u8(dst, result, 7);
-#else
+#ifdef VPX_INCOMPATIBLE_GCC
     /*
      * uint8x8x4_t result
     00 01 02 03 | 04 05 06 07
@@ -316,7 +301,23 @@ static INLINE void write_4x8(unsigned char *dst, int pitch,
     vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
     dst += pitch;
     vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
-#endif
+#else
+    vst4_lane_u8(dst, result, 0);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 1);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 2);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 3);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 4);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 5);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 6);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 7);
+#endif  // VPX_INCOMPATIBLE_GCC
 }
 
 void vp8_loop_filter_vertical_edge_y_neon(
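
The churn in write_4x8 above is the point of this file's change: the per-site `__GNUC__ == 4 && __GNUC_MINOR__ >= 7` checks are replaced by a single VPX_INCOMPATIBLE_GCC macro from vpx_ports/arm.h, and the branches are flipped so the direct vst4_lane_u8 path becomes the default and the element-by-element workaround the exception. The guard presumably inverts the old test, along the lines of the sketch below (an assumption for illustration only; the authoritative condition lives in vpx_ports/arm.h):

    /* Hypothetical sketch: mark GCC releases older than 4.7, whose NEON
     * lane load/store intrinsics were problematic, as incompatible. */
    #if defined(__GNUC__) && !defined(__clang__) && \
        (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6))
    #define VPX_INCOMPATIBLE_GCC
    #endif
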
index d5178bbae1461562a33640b26b13017f73428ddc..e1c8609e37105c23a504984662c8c8e021011154 100644 (file)
 
 #include <arm_neon.h>
 #include "./vpx_config.h"
+#include "vpx_ports/arm.h"
 
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-static INLINE void write_2x8(unsigned char *dst, int pitch,
-                             const uint8x8x2_t result,
-                             const uint8x8x2_t result2) {
-  vst2_lane_u8(dst, result, 0);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 1);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 2);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 3);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 4);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 5);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 6);
-  dst += pitch;
-  vst2_lane_u8(dst, result, 7);
-  dst += pitch;
-
-  vst2_lane_u8(dst, result2, 0);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 1);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 2);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 3);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 4);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 5);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 6);
-  dst += pitch;
-  vst2_lane_u8(dst, result2, 7);
-}
-#else
+#ifdef VPX_INCOMPATIBLE_GCC
 static INLINE void write_2x4(unsigned char *dst, int pitch,
                              const uint8x8x2_t result) {
     /*
@@ -88,30 +52,47 @@ static INLINE void write_2x8(unsigned char *dst, int pitch,
   dst += pitch * 8;
   write_2x4(dst, pitch, result2);
 }
-#endif
-
+#else
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+  vst2_lane_u8(dst, result, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 7);
+  dst += pitch;
 
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-static INLINE
-uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
-    x = vld4_lane_u8(src, x, 0);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 1);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 2);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 3);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 4);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 5);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 6);
-    src += pitch;
-    x = vld4_lane_u8(src, x, 7);
-    return x;
+  vst2_lane_u8(dst, result2, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 7);
 }
-#else
+#endif  // VPX_INCOMPATIBLE_GCC
+
+
+#ifdef VPX_INCOMPATIBLE_GCC
 static INLINE
 uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
     const uint8x8_t a = vld1_u8(src);
@@ -169,7 +150,27 @@ uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
 
     return x;
 }
-#endif
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+    x = vld4_lane_u8(src, x, 0);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 1);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 2);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 3);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 4);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 5);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 6);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 7);
+    return x;
+}
+#endif  // VPX_INCOMPATIBLE_GCC
 
 static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
         unsigned char *s,
index d6b67f89503478ad8c7aa9a4c44c5d0c6a986a04..5ad9465002eaae24ca5e2ae83e448f66f23ec3fa 100644 (file)
@@ -9,7 +9,17 @@
  */
 
 #include <arm_neon.h>
+#include "vpx_ports/arm.h"
 
+#ifdef VPX_INCOMPATIBLE_GCC
+#include "./vp8_rtcd.h"
+void vp8_short_walsh4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+  vp8_short_walsh4x4_c(input, output, pitch);
+}
+#else
 void vp8_short_walsh4x4_neon(
         int16_t *input,
         int16_t *output,
@@ -116,3 +126,4 @@ void vp8_short_walsh4x4_neon(
     vst1q_s16(output + 8, q1s16);
     return;
 }
+#endif  // VPX_INCOMPATIBLE_GCC
index 54abe76acd2a2bd87885faca80eda252f6ffdab2..545f2c8932160f115df3529020eb2d3bd28d22dd 100644 (file)
@@ -393,8 +393,8 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 #endif
 
     /* central mv */
-    bestmv->as_mv.row <<= 3;
-    bestmv->as_mv.col <<= 3;
+    bestmv->as_mv.row *= 8;
+    bestmv->as_mv.col *= 8;
     startmv = *bestmv;
 
     /* calculate central point error */
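
Replacing `<<= 3` with `*= 8` in mcomp.c is a correctness fix rather than a style tweak: motion-vector components can be negative, and left-shifting a negative signed integer is undefined behavior in C. Multiplying by 8 performs the intended full-pel to eighth-pel scaling with defined semantics everywhere. A minimal illustration (types simplified from the MV structs):

    #include <stdio.h>

    int main(void) {
      short row = -3;            /* full-pel component, may be negative */
      short scaled = row * 8;    /* well-defined: -24, in eighth-pel units */
      /* short bad = row << 3;      undefined behavior when row < 0 */
      printf("%d\n", scaled);
      return 0;
    }
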
index 702efe07b6ec86d7bd94e0e3b80efab9f1956308..03ab2257e317ce59334f7dc18572d5a73ff6a463 100644 (file)
@@ -21,7 +21,6 @@
 #include "vp9/common/vp9_common_data.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_scale.h"
 #include "vp9/common/vp9_seg_common.h"
index b196fc527959f8fa723ba37efb3eefe7b8e063cb..5b031752b8ce8d61d0b312cabbfbe6119314102a 100644 (file)
@@ -8,14 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <assert.h>
 #include <math.h>
 
-#include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
 #if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
index 694be3cf94bf0991dc33ef473bcdb29fc54485f0..1d619824a575d6cfca12b22f01ac14feca38b258 100644 (file)
@@ -36,17 +36,6 @@ extern "C" {
 #define dual_set_epi16(a, b) \
   _mm_set_epi16(b, b, b, b, a, a, a, a)
 
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-#if CONFIG_VP9_HIGHBITDEPTH
-typedef int64_t tran_high_t;
-typedef int32_t tran_low_t;
-#else
-typedef int32_t tran_high_t;
-typedef int16_t tran_low_t;
-#endif
-
 // Constants:
 //  for (int i = 1; i< 32; ++i)
 //    printf("static const int cospi_%d_64 = %.0f;\n", i,
index 102eb71a55da47fe887db17ab54b362131e32679..4d6d457ca9fc287a8b839e63e496b24745b82c33 100644 (file)
@@ -392,6 +392,107 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_filter_selectively_vert_row2(PLANE_TYPE plane_type,
+                                              uint16_t *s, int pitch,
+                                              unsigned int mask_16x16_l,
+                                              unsigned int mask_8x8_l,
+                                              unsigned int mask_4x4_l,
+                                              unsigned int mask_4x4_int_l,
+                                              const loop_filter_info_n *lfi_n,
+                                              const uint8_t *lfl, int bd) {
+  const int mask_shift = plane_type ? 4 : 8;
+  const int mask_cutoff = plane_type ? 0xf : 0xff;
+  const int lfl_forward = plane_type ? 4 : 8;
+
+  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  unsigned int mask;
+
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+       mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+
+    // TODO(yunqingwang): count in loopfilter functions should be removed.
+    if (mask & 1) {
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        if ((mask_16x16_0 & mask_16x16_1) & 1) {
+          vp9_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                          lfi0->hev_thr, bd);
+        } else if (mask_16x16_0 & 1) {
+          vp9_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                     lfi0->hev_thr, bd);
+        } else {
+          vp9_highbd_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim,
+                                     lfi1->lim, lfi1->hev_thr, bd);
+        }
+      }
+
+      if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        if ((mask_8x8_0 & mask_8x8_1) & 1) {
+          vp9_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_8x8_0 & 1) {
+          vp9_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vp9_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+
+      if ((mask_4x4_0 | mask_4x4_1) & 1) {
+        if ((mask_4x4_0 & mask_4x4_1) & 1) {
+          vp9_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_0 & 1) {
+          vp9_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vp9_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+
+      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
+        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
+          vp9_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_int_0 & 1) {
+          vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vp9_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+    }
+
+    s += 8;
+    lfl += 1;
+    mask_16x16_0 >>= 1;
+    mask_8x8_0 >>= 1;
+    mask_4x4_0 >>= 1;
+    mask_4x4_int_0 >>= 1;
+    mask_16x16_1 >>= 1;
+    mask_8x8_1 >>= 1;
+    mask_4x4_1 >>= 1;
+    mask_4x4_int_1 >>= 1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void filter_selectively_horiz(uint8_t *s, int pitch,
                                      unsigned int mask_16x16,
                                      unsigned int mask_8x8,
@@ -419,7 +520,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
         }
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
-          // Next block's thresholds
+          // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
           vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
@@ -448,7 +549,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
-          // Next block's thresholds
+          // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
           vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
@@ -488,6 +589,112 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_filter_selectively_horiz(uint16_t *s, int pitch,
+                                          unsigned int mask_16x16,
+                                          unsigned int mask_8x8,
+                                          unsigned int mask_4x4,
+                                          unsigned int mask_4x4_int,
+                                          const loop_filter_info_n *lfi_n,
+                                          const uint8_t *lfl, int bd) {
+  unsigned int mask;
+  int count;
+
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= count) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    count = 1;
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        if ((mask_16x16 & 3) == 3) {
+          vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 2, bd);
+          count = 2;
+        } else {
+          vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 1, bd);
+        }
+      } else if (mask_8x8 & 1) {
+        if ((mask_8x8 & 3) == 3) {
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vp9_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, lfin->mblim, lfin->lim,
+                                           lfin->hev_thr, bd);
+
+          if ((mask_4x4_int & 3) == 3) {
+            vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr, bd);
+          } else {
+            if (mask_4x4_int & 1) {
+              vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1, bd);
+            } else if (mask_4x4_int & 2) {
+              vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                          lfin->lim, lfin->hev_thr, 1, bd);
+            }
+          }
+          count = 2;
+        } else {
+          vp9_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1, bd);
+
+          if (mask_4x4_int & 1) {
+            vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1, bd);
+          }
+        }
+      } else if (mask_4x4 & 1) {
+        if ((mask_4x4 & 3) == 3) {
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vp9_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, lfin->mblim, lfin->lim,
+                                           lfin->hev_thr, bd);
+          if ((mask_4x4_int & 3) == 3) {
+            vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr, bd);
+          } else {
+            if (mask_4x4_int & 1) {
+              vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1, bd);
+            } else if (mask_4x4_int & 2) {
+              vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                          lfin->lim, lfin->hev_thr, 1, bd);
+            }
+          }
+          count = 2;
+        } else {
+          vp9_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1, bd);
+
+          if (mask_4x4_int & 1) {
+            vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1, bd);
+          }
+        }
+      } else if (mask_4x4_int & 1) {
+        vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, 1, bd);
+      }
+    }
+    s += 8 * count;
+    lfl += count;
+    mask_16x16 >>= count;
+    mask_8x8 >>= count;
+    mask_4x4 >>= count;
+    mask_4x4_int >>= count;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // This function ors into the current lfm structure, where to do loop
 // filters for the specific mi we are looking at. It uses information
 // including the block_size_type (32x16, 32x32, etc.), the transform size,
@@ -903,14 +1110,53 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_filter_selectively_vert(uint16_t *s, int pitch,
+                                         unsigned int mask_16x16,
+                                         unsigned int mask_8x8,
+                                         unsigned int mask_4x4,
+                                         unsigned int mask_4x4_int,
+                                         const loop_filter_info_n *lfi_n,
+                                         const uint8_t *lfl, int bd) {
+  unsigned int mask;
+
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        vp9_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
+                                   lfi->hev_thr, bd);
+      } else if (mask_8x8 & 1) {
+        vp9_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 1, bd);
+      } else if (mask_4x4 & 1) {
+        vp9_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 1, bd);
+      }
+    }
+    if (mask_4x4_int & 1)
+      vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 1, bd);
+    s += 8;
+    lfl += 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void filter_block_plane_non420(VP9_COMMON *cm,
                                       struct macroblockd_plane *plane,
                                       MODE_INFO *mi_8x8,
                                       int mi_row, int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
-  const int row_step = 1 << ss_x;
-  const int col_step = 1 << ss_y;
+  const int row_step = 1 << ss_y;
+  const int col_step = 1 << ss_x;
   const int row_step_stride = cm->mi_stride * row_step;
   struct buf_2d *const dst = &plane->dst;
   uint8_t* const dst0 = dst->buf;
@@ -1001,12 +1247,32 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
 
     // Disable filtering on the leftmost column
     border_mask = ~(mi_col == 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      high_filter_selectively_vert(CONVERT_TO_SHORTPTR(dst->buf),
+                                   dst->stride,
+                                   mask_16x16_c & border_mask,
+                                   mask_8x8_c & border_mask,
+                                   mask_4x4_c & border_mask,
+                                   mask_4x4_int[r],
+                                   &cm->lf_info, &lfl[r << 3],
+                                   (int)cm->bit_depth);
+    } else {
+      filter_selectively_vert(dst->buf, dst->stride,
+                              mask_16x16_c & border_mask,
+                              mask_8x8_c & border_mask,
+                              mask_4x4_c & border_mask,
+                              mask_4x4_int[r],
+                              &cm->lf_info, &lfl[r << 3]);
+    }
+#else
     filter_selectively_vert(dst->buf, dst->stride,
                             mask_16x16_c & border_mask,
                             mask_8x8_c & border_mask,
                             mask_4x4_c & border_mask,
                             mask_4x4_int[r],
                             &cm->lf_info, &lfl[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
     mi_8x8 += row_step_stride;
   }
@@ -1030,13 +1296,32 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
       mask_8x8_r = mask_8x8[r];
       mask_4x4_r = mask_4x4[r];
     }
-
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      high_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                    dst->stride,
+                                    mask_16x16_r,
+                                    mask_8x8_r,
+                                    mask_4x4_r,
+                                    mask_4x4_int_r,
+                                    &cm->lf_info, &lfl[r << 3],
+                                    (int)cm->bit_depth);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride,
+                               mask_16x16_r,
+                               mask_8x8_r,
+                               mask_4x4_r,
+                               mask_4x4_int_r,
+                               &cm->lf_info, &lfl[r << 3]);
+    }
+#else
     filter_selectively_horiz(dst->buf, dst->stride,
                              mask_16x16_r,
                              mask_8x8_r,
                              mask_4x4_r,
                              mask_4x4_int_r,
                              &cm->lf_info, &lfl[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
   }
 }
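
Buried in this hunk is a genuine bug fix: row_step was derived from the horizontal subsampling factor (ss_x) and col_step from the vertical one (ss_y), i.e. swapped. For 4:2:0 content both factors are 1, so the swap was invisible; for formats where they differ, the mode-info walk would step incorrectly. A quick check with 4:2:2 chroma values (standard subsampling factors, not from this diff):

    /* 4:2:2 chroma: halved horizontally (ss_x = 1), full height (ss_y = 0). */
    const int ss_x = 1, ss_y = 0;
    const int row_step = 1 << ss_y;  /* 1: every mi row       (was 1 << ss_x) */
    const int col_step = 1 << ss_x;  /* 2: every other column (was 1 << ss_y) */
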
@@ -1062,7 +1347,29 @@ void vp9_filter_block_plane(VP9_COMMON *const cm,
       unsigned int mask_4x4_l = mask_4x4 & 0xffff;
       unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
 
-      // Disable filtering on the leftmost column
+      // Disable filtering on the leftmost column.
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        high_filter_selectively_vert_row2(plane->plane_type,
+                                          CONVERT_TO_SHORTPTR(dst->buf),
+                                          dst->stride,
+                                          mask_16x16_l,
+                                          mask_8x8_l,
+                                          mask_4x4_l,
+                                          mask_4x4_int_l,
+                                          &cm->lf_info, &lfm->lfl_y[r << 3],
+                                          (int)cm->bit_depth);
+      } else {
+        filter_selectively_vert_row2(plane->plane_type,
+                                     dst->buf, dst->stride,
+                                     mask_16x16_l,
+                                     mask_8x8_l,
+                                     mask_4x4_l,
+                                     mask_4x4_int_l,
+                                     &cm->lf_info,
+                                     &lfm->lfl_y[r << 3]);
+      }
+#else
       filter_selectively_vert_row2(plane->plane_type,
                                    dst->buf, dst->stride,
                                    mask_16x16_l,
@@ -1070,7 +1377,7 @@ void vp9_filter_block_plane(VP9_COMMON *const cm,
                                    mask_4x4_l,
                                    mask_4x4_int_l,
                                    &cm->lf_info, &lfm->lfl_y[r << 3]);
-
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       dst->buf += 16 * dst->stride;
       mask_16x16 >>= 16;
       mask_8x8 >>= 16;
@@ -1100,12 +1407,35 @@ void vp9_filter_block_plane(VP9_COMMON *const cm,
         mask_4x4_r = mask_4x4 & 0xff;
       }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        high_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride,
+                                      mask_16x16_r,
+                                      mask_8x8_r,
+                                      mask_4x4_r,
+                                      mask_4x4_int & 0xff,
+                                      &cm->lf_info,
+                                      &lfm->lfl_y[r << 3],
+                                      (int)cm->bit_depth);
+      } else {
+        filter_selectively_horiz(dst->buf, dst->stride,
+                                 mask_16x16_r,
+                                 mask_8x8_r,
+                                 mask_4x4_r,
+                                 mask_4x4_int & 0xff,
+                                 &cm->lf_info,
+                                 &lfm->lfl_y[r << 3]);
+      }
+#else
       filter_selectively_horiz(dst->buf, dst->stride,
                                mask_16x16_r,
                                mask_8x8_r,
                                mask_4x4_r,
                                mask_4x4_int & 0xff,
-                               &cm->lf_info, &lfm->lfl_y[r << 3]);
+                               &cm->lf_info,
+                               &lfm->lfl_y[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
       dst->buf += 8 * dst->stride;
       mask_16x16 >>= 8;
@@ -1135,14 +1465,39 @@ void vp9_filter_block_plane(VP9_COMMON *const cm,
         unsigned int mask_4x4_l = mask_4x4 & 0xff;
         unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
 
-        // Disable filtering on the leftmost column
+        // Disable filtering on the leftmost column.
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          high_filter_selectively_vert_row2(plane->plane_type,
+                                            CONVERT_TO_SHORTPTR(dst->buf),
+                                            dst->stride,
+                                            mask_16x16_l,
+                                            mask_8x8_l,
+                                            mask_4x4_l,
+                                            mask_4x4_int_l,
+                                            &cm->lf_info,
+                                            &lfm->lfl_uv[r << 1],
+                                            (int)cm->bit_depth);
+        } else {
+          filter_selectively_vert_row2(plane->plane_type,
+                                       dst->buf, dst->stride,
+                                       mask_16x16_l,
+                                       mask_8x8_l,
+                                       mask_4x4_l,
+                                       mask_4x4_int_l,
+                                       &cm->lf_info,
+                                       &lfm->lfl_uv[r << 1]);
+        }
+#else
         filter_selectively_vert_row2(plane->plane_type,
                                      dst->buf, dst->stride,
                                      mask_16x16_l,
                                      mask_8x8_l,
                                      mask_4x4_l,
                                      mask_4x4_int_l,
-                                     &cm->lf_info, &lfm->lfl_uv[r << 1]);
+                                     &cm->lf_info,
+                                     &lfm->lfl_uv[r << 1]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         dst->buf += 16 * dst->stride;
         mask_16x16 >>= 8;
@@ -1177,12 +1532,35 @@ void vp9_filter_block_plane(VP9_COMMON *const cm,
         mask_4x4_r = mask_4x4 & 0xf;
       }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        high_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride,
+                                      mask_16x16_r,
+                                      mask_8x8_r,
+                                      mask_4x4_r,
+                                      mask_4x4_int_r,
+                                      &cm->lf_info,
+                                      &lfm->lfl_uv[r << 1],
+                                      (int)cm->bit_depth);
+      } else {
+        filter_selectively_horiz(dst->buf, dst->stride,
+                                 mask_16x16_r,
+                                 mask_8x8_r,
+                                 mask_4x4_r,
+                                 mask_4x4_int_r,
+                                 &cm->lf_info,
+                                 &lfm->lfl_uv[r << 1]);
+      }
+#else
       filter_selectively_horiz(dst->buf, dst->stride,
                                mask_16x16_r,
                                mask_8x8_r,
                                mask_4x4_r,
                                mask_4x4_int_r,
-                               &cm->lf_info, &lfm->lfl_uv[r << 1]);
+                               &cm->lf_info,
+                               &lfm->lfl_uv[r << 1]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
       dst->buf += 8 * dst->stride;
       mask_16x16 >>= 4;
index 25d3311b6f6b00137157e5b97227b56b2370ed6a..5af52c273c7acdb20a0182d8cd9abb7fa624157e 100644 (file)
@@ -17,6 +17,20 @@ static INLINE int8_t signed_char_clamp(int t) {
   return (int8_t)clamp(t, -128, 127);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+  switch (bd) {
+    case 10:
+      return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
+    case 12:
+      return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
+    case 8:
+    default:
+      return (int16_t)clamp(t, -128, 128 - 1);
+  }
+}
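+// Illustrative example: at bd == 10 the clamp range widens to
+// [-512, 511] and at bd == 12 to [-2048, 2047], i.e. the 8-bit
+// [-128, 127] range scaled by 1 << (bd - 8).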
+#endif
+
 // should we apply any filter at all: 11111111 yes, 00000000 no
 static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
                                  uint8_t p3, uint8_t p2,
@@ -337,3 +351,390 @@ void vp9_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Should we apply any filter at all: 11111111 yes, 00000000 no?
+static INLINE int8_t high_filter_mask(uint8_t limit, uint8_t blimit,
+                                      uint16_t p3, uint16_t p2,
+                                      uint16_t p1, uint16_t p0,
+                                      uint16_t q0, uint16_t q1,
+                                      uint16_t q2, uint16_t q3, int bd) {
+  int8_t mask = 0;
+  int16_t limit16 = (uint16_t)limit << (bd - 8);
+  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+  mask |= (abs(p3 - p2) > limit16) * -1;
+  mask |= (abs(p2 - p1) > limit16) * -1;
+  mask |= (abs(p1 - p0) > limit16) * -1;
+  mask |= (abs(q1 - q0) > limit16) * -1;
+  mask |= (abs(q2 - q1) > limit16) * -1;
+  mask |= (abs(q3 - q2) > limit16) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+  return ~mask;
+}
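+// Each (condition) * -1 term above evaluates to 0x00 or 0xFF, so the
+// ORed accumulator records branchlessly whether any threshold was
+// exceeded; the final ~mask flips that into the "apply filter" sense.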
+
+static INLINE int8_t high_flat_mask4(uint8_t thresh,
+                                     uint16_t p3, uint16_t p2,
+                                     uint16_t p1, uint16_t p0,
+                                     uint16_t q0, uint16_t q1,
+                                     uint16_t q2, uint16_t q3, int bd) {
+  int8_t mask = 0;
+  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  mask |= (abs(p1 - p0) > thresh16) * -1;
+  mask |= (abs(q1 - q0) > thresh16) * -1;
+  mask |= (abs(p2 - p0) > thresh16) * -1;
+  mask |= (abs(q2 - q0) > thresh16) * -1;
+  mask |= (abs(p3 - p0) > thresh16) * -1;
+  mask |= (abs(q3 - q0) > thresh16) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t high_flat_mask5(uint8_t thresh,
+                                     uint16_t p4, uint16_t p3,
+                                     uint16_t p2, uint16_t p1,
+                                     uint16_t p0, uint16_t q0,
+                                     uint16_t q1, uint16_t q2,
+                                     uint16_t q3, uint16_t q4, int bd) {
+  int8_t mask = ~high_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  mask |= (abs(p4 - p0) > thresh16) * -1;
+  mask |= (abs(q4 - q0) > thresh16) * -1;
+  return ~mask;
+}
+
+// Is there high edge variance at the internal edge:
+// 11111111_11111111 yes, 00000000_00000000 no?
+static INLINE int16_t high_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
+                                    uint16_t q0, uint16_t q1, int bd) {
+  int16_t hev = 0;
+  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  hev |= (abs(p1 - p0) > thresh16) * -1;
+  hev |= (abs(q1 - q0) > thresh16) * -1;
+  return hev;
+}
+
+static INLINE void high_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
+                                uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+                                int bd) {
+  int16_t filter1, filter2;
+  // Subtracting 0x80 << shift re-centers the samples around zero, the
+  // high bit depth analogue of the 8-bit ^0x80 trick that maps 0..255
+  // to -128..+127.
+  int shift = bd - 8;
+  const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
+  const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
+  const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
+  const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
+  const uint16_t hev = high_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+
+  // Add outer taps if we have high edge variance.
+  int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
+
+  // Inner taps.
+  filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
+
+  // Round one side +4 and the other +3 before the >> 3 so the two sides
+  // of the edge share the rounding bias instead of both rounding the
+  // same way.
+  filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
+  filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
+
+  *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
+  *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
+
+  // Outer tap adjustments.
+  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+  *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
+  *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
+}
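+// Worked example of the asymmetric rounding above: a filter value of 4
+// gives filter1 == (4 + 4) >> 3 == 1 and filter2 == (4 + 3) >> 3 == 0,
+// so q0 moves by one step while p0 stays put.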
+
+void vp9_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+                                   const uint8_t *blimit, const uint8_t *limit,
+                                   const uint8_t *thresh, int count, int bd) {
+  int i;
+
+  // Mirrors the 8-bit loop filter's layout: 8 * count pixels per call,
+  // the segment width the SIMD versions operate on.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4 * p];
+    const uint16_t p2 = s[-3 * p];
+    const uint16_t p1 = s[-2 * p];
+    const uint16_t p0 = s[-p];
+    const uint16_t q0 = s[0 * p];
+    const uint16_t q1 = s[1 * p];
+    const uint16_t q2 = s[2 * p];
+    const uint16_t q3 = s[3 * p];
+    const int8_t mask = high_filter_mask(*limit, *blimit,
+                                         p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    high_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+    ++s;
+  }
+}
+
+void vp9_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
+                                        const uint8_t *blimit0,
+                                        const uint8_t *limit0,
+                                        const uint8_t *thresh0,
+                                        const uint8_t *blimit1,
+                                        const uint8_t *limit1,
+                                        const uint8_t *thresh1,
+                                        int bd) {
+  vp9_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vp9_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+void vp9_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int count, int bd) {
+  int i;
+
+  // Mirrors the 8-bit loop filter's layout: 8 * count pixels per call,
+  // the segment width the SIMD versions operate on.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
+    const int8_t mask = high_filter_mask(*limit, *blimit,
+                                         p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    high_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
+    s += pitch;
+  }
+}
+
+void vp9_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
+                                      const uint8_t *blimit0,
+                                      const uint8_t *limit0,
+                                      const uint8_t *thresh0,
+                                      const uint8_t *blimit1,
+                                      const uint8_t *limit1,
+                                      const uint8_t *thresh1,
+                                      int bd) {
+  vp9_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vp9_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+                              thresh1, 1, bd);
+}
+
+static INLINE void high_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+                                uint16_t *op3, uint16_t *op2,
+                                uint16_t *op1, uint16_t *op0,
+                                uint16_t *oq0, uint16_t *oq1,
+                                uint16_t *oq2, uint16_t *oq3, int bd) {
+  if (flat && mask) {
+    const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+  } else {
+    high_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+  }
+}
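+// The 7-tap weights above sum to 8, so ROUND_POWER_OF_TWO(., 3) yields
+// a normalized rounded average; the outermost samples (p3, q3) are
+// replicated to cover taps that fall outside the 8-pixel span.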
+
+void vp9_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count, int bd) {
+  int i;
+
+  // Mirrors the 8-bit loop filter's layout: 8 * count pixels per call,
+  // the segment width the SIMD versions operate on.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+    const int8_t mask = high_filter_mask(*limit, *blimit,
+                                         p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = high_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    high_filter8(mask, *thresh, flat,
+                 s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+                 s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+    ++s;
+  }
+}
+
+void vp9_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
+                                        const uint8_t *blimit0,
+                                        const uint8_t *limit0,
+                                        const uint8_t *thresh0,
+                                        const uint8_t *blimit1,
+                                        const uint8_t *limit1,
+                                        const uint8_t *thresh1,
+                                        int bd) {
+  vp9_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vp9_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+void vp9_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int count, int bd) {
+  int i;
+
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const int8_t mask = high_filter_mask(*limit, *blimit,
+                                         p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = high_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    high_filter8(mask, *thresh, flat,
+                 s - 4, s - 3, s - 2, s - 1,
+                 s, s + 1, s + 2, s + 3,
+                 bd);
+    s += pitch;
+  }
+}
+
+void vp9_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
+                                      const uint8_t *blimit0,
+                                      const uint8_t *limit0,
+                                      const uint8_t *thresh0,
+                                      const uint8_t *blimit1,
+                                      const uint8_t *limit1,
+                                      const uint8_t *thresh1,
+                                      int bd) {
+  vp9_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vp9_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+                              thresh1, 1, bd);
+}
+
+static INLINE void high_filter16(int8_t mask, uint8_t thresh,
+                                 uint8_t flat, uint8_t flat2,
+                                 uint16_t *op7, uint16_t *op6,
+                                 uint16_t *op5, uint16_t *op4,
+                                 uint16_t *op3, uint16_t *op2,
+                                 uint16_t *op1, uint16_t *op0,
+                                 uint16_t *oq0, uint16_t *oq1,
+                                 uint16_t *oq2, uint16_t *oq3,
+                                 uint16_t *oq4, uint16_t *oq5,
+                                 uint16_t *oq6, uint16_t *oq7, int bd) {
+  if (flat2 && flat && mask) {
+    const uint16_t p7 = *op7;
+    const uint16_t p6 = *op6;
+    const uint16_t p5 = *op5;
+    const uint16_t p4 = *op4;
+    const uint16_t p3 = *op3;
+    const uint16_t p2 = *op2;
+    const uint16_t p1 = *op1;
+    const uint16_t p0 = *op0;
+    const uint16_t q0 = *oq0;
+    const uint16_t q1 = *oq1;
+    const uint16_t q2 = *oq2;
+    const uint16_t q3 = *oq3;
+    const uint16_t q4 = *oq4;
+    const uint16_t q5 = *oq5;
+    const uint16_t q6 = *oq6;
+    const uint16_t q7 = *oq7;
+
+    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0, 4);
+    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1, 4);
+    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2, 4);
+    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3, 4);
+    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4, 4);
+    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5, 4);
+    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
+    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
+    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+    *oq6 = ROUND_POWER_OF_TWO(p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+  } else {
+    high_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+                 bd);
+  }
+}
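+// As with high_filter8, the 15-tap weights sum to 16 (hence the shift
+// by 4); p7 and q7 take progressively larger weights toward the span
+// ends to stand in for samples beyond the 16-pixel window.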
+
+void vp9_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                                    const uint8_t *limit, const uint8_t *thresh,
+                                    int count, int bd) {
+  int i;
+
+  // Mirrors the 8-bit loop filter's layout: 8 * count pixels per call,
+  // the segment width the SIMD versions operate on.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4 * p];
+    const uint16_t p2 = s[-3 * p];
+    const uint16_t p1 = s[-2 * p];
+    const uint16_t p0 = s[-p];
+    const uint16_t q0 = s[0 * p];
+    const uint16_t q1 = s[1 * p];
+    const uint16_t q2 = s[2 * p];
+    const uint16_t q3 = s[3 * p];
+    const int8_t mask = high_filter_mask(*limit, *blimit,
+                                         p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = high_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat2 = high_flat_mask5(
+        1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
+        q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
+
+    high_filter16(mask, *thresh, flat, flat2,
+                  s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
+                  s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+                  s, s + 1 * p, s + 2 * p, s + 3 * p,
+                  s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
+                  bd);
+    ++s;
+  }
+}
+
+static void high_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+                                        const uint8_t *blimit,
+                                        const uint8_t *limit,
+                                        const uint8_t *thresh,
+                                        int count, int bd) {
+  int i;
+
+  for (i = 0; i < count; ++i) {
+    const uint16_t p3 = s[-4];
+    const uint16_t p2 = s[-3];
+    const uint16_t p1 = s[-2];
+    const uint16_t p0 = s[-1];
+    const uint16_t q0 = s[0];
+    const uint16_t q1 = s[1];
+    const uint16_t q2 = s[2];
+    const uint16_t q3 = s[3];
+    const int8_t mask = high_filter_mask(*limit, *blimit,
+                                         p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = high_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat2 = high_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+                                         q0, s[4], s[5], s[6], s[7], bd);
+
+    high_filter16(mask, *thresh, flat, flat2,
+                  s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+                  s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
+                  bd);
+    s += p;
+  }
+}
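+// Vertical edges are filtered by indexing along each row (s[-8] .. s[7])
+// and advancing by the pitch, so the same 16-wide filter serves both
+// orientations.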
+
+void vp9_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh,
+                                  int bd) {
+  high_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+}
+
+void vp9_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh,
+                                       int bd) {
+  high_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
index 792e9d970b7095d76de34d3f5ccbbdc04822bc26..3954c459b641c16f26cf0ab8baffac82b236f431 100644 (file)
@@ -180,6 +180,7 @@ typedef struct VP9Common {
 
   // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
   vpx_bit_depth_t bit_depth;
+  vpx_bit_depth_t dequant_bit_depth;  // Bit depth of the current dequantizer
 
 #if CONFIG_VP9_POSTPROC
   struct postproc_state  postproc_state;
index e4e6ce7826ed12ebf1f0f3acf15632a7b44255c5..abe71da40a7cfcedc1d8791989ea80fdfbbeebee 100644 (file)
@@ -19,6 +19,9 @@
 #include "vpx_scale/vpx_scale.h"
 #include "vpx_scale/yv12config.h"
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_postproc.h"
 #include "vp9/common/vp9_systemdependent.h"
@@ -152,6 +155,84 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
+                                            uint16_t *dst_ptr,
+                                            int src_pixels_per_line,
+                                            int dst_pixels_per_line,
+                                            int rows,
+                                            int cols,
+                                            int flimit) {
+  uint16_t const *p_src;
+  uint16_t *p_dst;
+  int row;
+  int col;
+  int i;
+  int v;
+  int pitch = src_pixels_per_line;
+  uint16_t d[8];
+
+  for (row = 0; row < rows; row++) {
+    // post_proc_down for one row.
+    p_src = src_ptr;
+    p_dst = dst_ptr;
+
+    for (col = 0; col < cols; col++) {
+      int kernel = 4;
+      int v = p_src[col];
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i * pitch]) > flimit)
+          goto down_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i * pitch];
+      }
+
+      v = (kernel >> 3);
+
+    down_skip_convolve:
+      p_dst[col] = v;
+    }
+
+    /* now post_proc_across */
+    p_src = dst_ptr;
+    p_dst = dst_ptr;
+
+    for (i = 0; i < 8; i++)
+      d[i] = p_src[i];
+
+    for (col = 0; col < cols; col++) {
+      int kernel = 4;
+      v = p_src[col];
+
+      d[col & 7] = v;
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i]) > flimit)
+          goto across_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i];
+      }
+
+      d[col & 7] = (kernel >> 3);
+
+    across_skip_convolve:
+      if (col >= 2)
+        p_dst[col - 2] = d[(col - 2) & 7];
+    }
+
+    /* handle the last two pixels */
+    p_dst[col - 2] = d[(col - 2) & 7];
+    p_dst[col - 1] = d[(col - 1) & 7];
+
+    /* next row */
+    src_ptr += pitch;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
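+// Usage note: the down pass writes src -> dst and the across pass then
+// filters dst in place; flimit rejects taps across strong edges so
+// detail is preserved rather than smeared.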
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static int q2mbl(int x) {
   if (x < 20) x = 20;
 
@@ -162,10 +243,46 @@ static int q2mbl(int x) {
 void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
                                  int rows, int cols, int flimit) {
   int r, c, i;
-
   uint8_t *s = src;
   uint8_t d[16];
 
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum = 0;
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum += s[i];
+      d[i + 8] = 0;
+    }
+
+    for (c = 0; c < cols + 8; c++) {
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+
+      sum += x;
+      sumsq += x * y;
+
+      d[c & 15] = s[c];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+
+      s[c - 8] = d[(c - 8) & 15];
+    }
+    s += pitch;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
+                                        int rows, int cols, int flimit) {
+  int r, c, i;
+
+  uint16_t *s = src;
+  uint16_t d[16];
 
   for (r = 0; r < rows; r++) {
     int sumsq = 0;
@@ -196,6 +313,7 @@ void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
     s += pitch;
   }
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
                             int rows, int cols, int flimit) {
@@ -229,6 +347,40 @@ void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch,
+                                   int rows, int cols, int flimit) {
+  int r, c, i;
+  const int16_t *rv3 = &vp9_rv[63 & rand()];  // NOLINT
+
+  for (c = 0; c < cols; c++) {
+    uint16_t *s = &dst[c];
+    int sumsq = 0;
+    int sum = 0;
+    uint16_t d[16];
+    const int16_t *rv2 = rv3 + ((c * 17) & 127);
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum += s[i * pitch];
+    }
+
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+
+      s[-8 * pitch] = d[(r - 8) & 15];
+      s += pitch;
+    }
+  }
+}
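+// The sum/sumsq updates slide a 15-sample window down each column in
+// O(1) per pixel; the test (sumsq * 15 - sum * sum < flimit) smooths
+// only statistically flat regions.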
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
                                        YV12_BUFFER_CONFIG   *post,
                                        int                   q,
@@ -239,6 +391,51 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
   (void) low_var_thresh;
   (void) flag;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer),
+                                         CONVERT_TO_SHORTPTR(post->y_buffer),
+                                         source->y_stride, post->y_stride,
+                                         source->y_height, source->y_width,
+                                         ppl);
+
+    vp9_highbd_mbpost_proc_across_ip(CONVERT_TO_SHORTPTR(post->y_buffer),
+                                     post->y_stride, post->y_height,
+                                     post->y_width, q2mbl(q));
+
+    vp9_highbd_mbpost_proc_down(CONVERT_TO_SHORTPTR(post->y_buffer),
+                                post->y_stride, post->y_height,
+                                post->y_width, q2mbl(q));
+
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->u_buffer),
+                                         CONVERT_TO_SHORTPTR(post->u_buffer),
+                                         source->uv_stride, post->uv_stride,
+                                         source->uv_height, source->uv_width,
+                                         ppl);
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->v_buffer),
+                                         CONVERT_TO_SHORTPTR(post->v_buffer),
+                                         source->uv_stride, post->uv_stride,
+                                         source->uv_height, source->uv_width,
+                                         ppl);
+  } else {
+    vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
+                                  source->y_stride, post->y_stride,
+                                  source->y_height, source->y_width, ppl);
+
+    vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+                              post->y_width, q2mbl(q));
+
+    vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+                         post->y_width, q2mbl(q));
+
+    vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
+                                  source->uv_stride, post->uv_stride,
+                                  source->uv_height, source->uv_width, ppl);
+    vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
+                                  source->uv_stride, post->uv_stride,
+                                  source->uv_height, source->uv_width, ppl);
+  }
+#else
   vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
                                 source->y_stride, post->y_stride,
                                 source->y_height, source->y_width, ppl);
@@ -255,6 +452,7 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
   vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
                                 source->uv_stride, post->uv_stride,
                                 source->uv_height, source->uv_width, ppl);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
@@ -272,9 +470,24 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
   const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
 
-  for (i = 0; i < MAX_MB_PLANE; ++i)
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(srcs[i]),
+                                           CONVERT_TO_SHORTPTR(dsts[i]),
+                                           src_strides[i], dst_strides[i],
+                                           src_heights[i], src_widths[i], ppl);
+    } else {
+      vp9_post_proc_down_and_across(srcs[i], dsts[i],
+                                    src_strides[i], dst_strides[i],
+                                    src_heights[i], src_widths[i], ppl);
+    }
+#else
     vp9_post_proc_down_and_across(srcs[i], dsts[i],
                                   src_strides[i], dst_strides[i],
                                   src_heights[i], src_widths[i], ppl);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
 }
 
 void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
@@ -293,15 +506,32 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     const int src_stride = src_strides[i];
-    const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
     const int src_width = src_widths[i] - 4;
     const int src_height = src_heights[i] - 4;
-
     const int dst_stride = dst_strides[i];
-    uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      const uint16_t *const src = CONVERT_TO_SHORTPTR(srcs[i] + 2 * src_stride
+                                                      + 2);
+      uint16_t *const dst = CONVERT_TO_SHORTPTR(dsts[i] + 2 * dst_stride + 2);
+      vp9_highbd_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+                                           src_height, src_width, ppl);
+    } else {
+      const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
+      uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+
+      vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+                                    src_height, src_width, ppl);
+    }
+#else
+    const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
+    uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
     vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
                                   src_height, src_width, ppl);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 }
 
@@ -405,6 +635,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
 #if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
   if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
                                VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate post-processing buffer");
index b49f1304151d31bee73657a80672c24b161c0e8b..1a46b39ad214121872876b5cadb37f97a1642129 100644 (file)
@@ -288,8 +288,9 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
     uint8_t *pre;
     MV32 scaled_mv;
     int xs, ys, subpel_x, subpel_y;
+    const int is_scaled = vp9_is_scaled(sf);
 
-    if (vp9_is_scaled(sf)) {
+    if (is_scaled) {
       pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
       scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
       xs = sf->x_step_q4;
@@ -385,12 +386,6 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
                ? average_split_mvs(pd, mi, ref, block)
                : mi->mbmi.mv[ref].as_mv;
 
-
-    // TODO(jkoleszar): This clamping is done in the incorrect place for the
-    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
-    // MV. Note however that it performs the subsampling aware scaling so
-    // that the result is always q4.
-    // mv_precision precision is MV_PRECISION_Q4.
     const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
                                                pd->subsampling_x,
                                                pd->subsampling_y);
@@ -400,6 +395,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
         subpel_x, subpel_y;
     uint8_t *ref_frame, *buf_ptr;
     const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
+    const int is_scaled = vp9_is_scaled(sf);
 
     // Get reference frame pointer, width and height.
     if (plane == 0) {
@@ -412,7 +408,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
       ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
     }
 
-    if (vp9_is_scaled(sf)) {
+    if (is_scaled) {
       // Co-ordinate of containing block to pixel precision.
       int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
       int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
@@ -451,7 +447,8 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
     subpel_x = scaled_mv.col & SUBPEL_MASK;
     subpel_y = scaled_mv.row & SUBPEL_MASK;
 
-    // Calculate the top left corner of the best matching block in the reference frame.
+    // Calculate the top left corner of the best matching block in the
+    // reference frame.
     x0 += scaled_mv.col >> SUBPEL_BITS;
     y0 += scaled_mv.row >> SUBPEL_BITS;
     x0_16 += scaled_mv.col;
@@ -463,20 +460,20 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
 
     // Do border extension if there is motion or the
     // width/height is not a multiple of 8 pixels.
-    if (scaled_mv.col || scaled_mv.row ||
+    if (is_scaled || scaled_mv.col || scaled_mv.row ||
         (frame_width & 0x7) || (frame_height & 0x7)) {
       // Get reference block bottom right coordinate.
       int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
       int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
       int x_pad = 0, y_pad = 0;
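+      // A step of SUBPEL_SHIFTS (16) in q4 units means unscaled; any
+      // other step implies scaling, which needs the VP9_INTERP_EXTEND
+      // border for the 8-tap interpolation, hence the widened tests below.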
 
-      if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
+      if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
         x0 -= VP9_INTERP_EXTEND - 1;
         x1 += VP9_INTERP_EXTEND;
         x_pad = 1;
       }
 
-      if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
+      if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
         y0 -= VP9_INTERP_EXTEND - 1;
         y1 += VP9_INTERP_EXTEND;
         y_pad = 1;
index 0f52ae15c351c0d41e4ed47569ac10ae448cf417..99460d1a12dce65332f8081a91ff2e62c40c1304 100644 (file)
@@ -6,7 +6,6 @@ print <<EOF
 
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -450,160 +449,160 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Intra prediction
   #
-  add_proto qw/void vp9_high_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d207_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d45_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d63_predictor_4x4/;
 
-  add_proto qw/void vp9_high_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_h_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d117_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d135_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d153_predictor_4x4/;
 
-  add_proto qw/void vp9_high_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_v_predictor_4x4 neon/, "$sse_x86inc";
 
-  add_proto qw/void vp9_high_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_tm_predictor_4x4/, "$sse_x86inc";
 
-  add_proto qw/void vp9_high_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_predictor_4x4/, "$sse_x86inc";
 
-  add_proto qw/void vp9_high_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_top_predictor_4x4/;
 
-  add_proto qw/void vp9_high_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_left_predictor_4x4/;
 
-  add_proto qw/void vp9_high_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_128_predictor_4x4/;
 
-  add_proto qw/void vp9_high_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d207_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d45_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d63_predictor_8x8/;
 
-  add_proto qw/void vp9_high_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_h_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d117_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d135_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d153_predictor_8x8/;
 
-  add_proto qw/void vp9_high_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_v_predictor_8x8/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_tm_predictor_8x8/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_predictor_8x8/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_top_predictor_8x8/;
 
-  add_proto qw/void vp9_high_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_left_predictor_8x8/;
 
-  add_proto qw/void vp9_high_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_128_predictor_8x8/;
 
-  add_proto qw/void vp9_high_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d207_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d45_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d63_predictor_16x16/;
 
-  add_proto qw/void vp9_high_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_h_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d117_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d135_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d153_predictor_16x16/;
 
-  add_proto qw/void vp9_high_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_v_predictor_16x16 neon/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_tm_predictor_16x16/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_predictor_16x16/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_top_predictor_16x16/;
 
-  add_proto qw/void vp9_high_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_left_predictor_16x16/;
 
-  add_proto qw/void vp9_high_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_128_predictor_16x16/;
 
-  add_proto qw/void vp9_high_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d207_predictor_32x32/;
 
-  add_proto qw/void vp9_high_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d45_predictor_32x32/;
 
-  add_proto qw/void vp9_high_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d63_predictor_32x32/;
 
-  add_proto qw/void vp9_high_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_h_predictor_32x32/;
 
-  add_proto qw/void vp9_high_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d117_predictor_32x32/;
 
-  add_proto qw/void vp9_high_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d135_predictor_32x32/;
 
-  add_proto qw/void vp9_high_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_d153_predictor_32x32/;
 
-  add_proto qw/void vp9_high_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_v_predictor_32x32/, "$sse2_x86inc";
 
-  add_proto qw/void vp9_high_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_tm_predictor_32x32/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_predictor_32x32/, "$sse2_x86_64";
 
-  add_proto qw/void vp9_high_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_top_predictor_32x32/;
 
-  add_proto qw/void vp9_high_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_left_predictor_32x32/;
 
-  add_proto qw/void vp9_high_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps";
+  add_proto qw/void vp9_high_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vp9_high_dc_128_predictor_32x32/;
 
   #
@@ -633,6 +632,59 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp9_high_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/vp9_high_convolve8_avg_vert/, "$sse2_x86_64";
 
+  #
+  # Loopfilter
+  #
+  add_proto qw/void vp9_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_16 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_16_dual sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_8 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_8_dual sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_4 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vp9_highbd_lpf_vertical_4_dual sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vp9_highbd_lpf_horizontal_16 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vp9_highbd_lpf_horizontal_8 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vp9_highbd_lpf_horizontal_8_dual sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vp9_highbd_lpf_horizontal_4 sse2/;
+
+  add_proto qw/void vp9_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vp9_highbd_lpf_horizontal_4_dual sse2/;
+
+  #
+  # post proc
+  #
+  if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+    add_proto qw/void vp9_highbd_mbpost_proc_down/, "uint16_t *dst, int pitch, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_mbpost_proc_down/;
+
+    add_proto qw/void vp9_highbd_mbpost_proc_across_ip/, "uint16_t *src, int pitch, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_mbpost_proc_across_ip/;
+
+    add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_post_proc_down_and_across/;
+
+    add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+    specialize qw/vp9_highbd_plane_add_noise/;
+  }
+
   #
   # dct
   #
@@ -1793,8 +1845,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp9_high_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_high_fdct32x32_rd/;
 
-  add_proto qw/void vp9_high_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-  specialize qw/vp9_high_temporal_filter_apply/;
+  add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+  specialize qw/vp9_highbd_temporal_filter_apply/;
 
 }
 # End vp9_high encoder functions
diff --git a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
new file mode 100644 (file)
index 0000000..32e4b20
--- /dev/null
@@ -0,0 +1,1119 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
+    __m128i ubounded;
+    __m128i lbounded;
+    __m128i retval;
+
+    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i one = _mm_set1_epi16(1);
+    const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+    const __m128i max = _mm_subs_epi16(
+        _mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
+    const __m128i min = _mm_subs_epi16(zero, t80);
+    ubounded = _mm_cmpgt_epi16(value, max);
+    lbounded = _mm_cmplt_epi16(value, min);
+    retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+    ubounded = _mm_and_si128(ubounded, max);
+    lbounded = _mm_and_si128(lbounded, min);
+    retval = _mm_or_si128(retval, ubounded);
+    retval = _mm_or_si128(retval, lbounded);
+    return retval;
+}
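+// In scalar terms the helper above clamps each 16-bit lane of 'value' to the
+// signed range of the current bit depth; e.g. for bd == 10, t80 == 0x200, so
+// the result is limited to [-512, 511] -- the 10-bit analogue of int8
+// clamping.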
+
+// TODO(debargha, peter): Break up large functions into smaller ones
+// in this file.
+static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
+                                                   int p,
+                                                   const uint8_t *_blimit,
+                                                   const uint8_t *_limit,
+                                                   const uint8_t *_thresh,
+                                                   int bd) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i blimit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(
+          _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
+  const __m128i limit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), bd - 8);
+  const __m128i thresh = _mm_slli_epi16(
+      _mm_unpacklo_epi8(
+          _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
+  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
+  __m128i ps1, qs1, ps0, qs0;
+  __m128i abs_p0q0, abs_p1q1, ffff, work;
+  __m128i filt, work_a, filter1, filter2;
+  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
+  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
+  __m128i flat2_q0, flat2_p0;
+  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
+  __m128i pixelFilter_p, pixelFilter_q;
+  __m128i pixelFilter_p2p1p0, pixelFilter_q2q1q0;
+  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
+  __m128i t4, t3, t80, t1;
+  __m128i eight, four;
+
+  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
+  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
+  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+
+  //  high_filter_mask
+  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+
+  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+
+  //  high_hev_mask (in C code this is actually called from high_filter4)
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // abs(p1 - q1) / 2
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
+                                    _mm_subs_epu16(p0, p1)),
+                       _mm_or_si128(_mm_subs_epu16(q1, q0),
+                                    _mm_subs_epu16(q0, q1)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask
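+  // In scalar terms: mask is all-ones for a column iff
+  //   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit
+  // and every one-step difference within p3..p0 and q0..q3 is <= limit.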
+
+  // lp filter
+  // high_filter4
+  t4 = _mm_set1_epi16(4);
+  t3 = _mm_set1_epi16(3);
+  t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+  t1 = _mm_set1_epi16(0x1);
+
+  ps1 = _mm_subs_epi16(p1, t80);
+  qs1 = _mm_subs_epi16(q1, t80);
+  ps0 = _mm_subs_epi16(p0, t80);
+  qs0 = _mm_subs_epi16(q0, t80);
+
+  filt = _mm_and_si128(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3, Filter2 >> 3
+  filter1 = _mm_srai_epi16(filter1, 0x3);
+  filter2 = _mm_srai_epi16(filter2, 0x3);
+
+  qs0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
+      t80);
+  ps0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
+      t80);
+  filt = _mm_adds_epi16(filter1, t1);
+  filt = _mm_srai_epi16(filt, 1);
+  filt = _mm_andnot_si128(hev, filt);
+
+  qs1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+      t80);
+  ps1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+      t80);
+  // end high_filter4
+  // loopfilter done
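+  // In scalar terms, filter4 computed
+  //   filt = clamp(clamp(ps1 - qs1) & hev + 3 * (qs0 - ps0)) & mask,
+  //   qs0 -= clamp(filt + 4) >> 3,   ps0 += clamp(filt + 3) >> 3,
+  // and, for non-hev columns only, moved qs1/ps1 by (filter1 + 1) >> 1.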
+
+  // high_flat_mask4
+  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+                                    _mm_subs_epu16(p0, p2)),
+                       _mm_or_si128(_mm_subs_epu16(p3, p0),
+                                    _mm_subs_epu16(p0, p3)));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
+                                    _mm_subs_epu16(q0, q2)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q0),
+                                    _mm_subs_epu16(q0, q3)));
+  flat = _mm_max_epi16(work, flat);
+  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  flat = _mm_max_epi16(work, flat);
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+  flat = _mm_cmpeq_epi16(flat, zero);
+  // end flat_mask4
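+  // i.e. a column is 'flat' when no sample in p3..p0/q0..q3 differs from
+  // p0/q0 by more than 1 << (bd - 8), one 8-bit quantum at this bit depth.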
+
+  // flat & mask = flat && mask (as used in filter8)
+  // (because, in both vars, each 16-bit lane is either all 1s or all 0s)
+  flat = _mm_and_si128(flat, mask);
+
+  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
+  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
+  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
+  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
+  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
+  q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+
+  // high_flat_mask5 (the arguments passed in are p0, q0, p4-p7 and q4-q7,
+  // but they are referred to as p0-p4 and q0-q4 inside that function)
+  flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
+                                     _mm_subs_epu16(p0, p4)),
+                        _mm_or_si128(_mm_subs_epu16(q4, q0),
+                                     _mm_subs_epu16(q0, q4)));
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
+                                    _mm_subs_epu16(p0, p5)),
+                       _mm_or_si128(_mm_subs_epu16(q5, q0),
+                                    _mm_subs_epu16(q0, q5)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
+                                    _mm_subs_epu16(p0, p6)),
+                       _mm_or_si128(_mm_subs_epu16(q6, q0),
+                                    _mm_subs_epu16(q0, q6)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
+                                    _mm_subs_epu16(p0, p7)),
+                       _mm_or_si128(_mm_subs_epu16(q7, q0),
+                                    _mm_subs_epu16(q0, q7)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, bd - 8));
+  flat2 = _mm_cmpeq_epi16(flat2, zero);
+  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  // end high_flat_mask5
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // flat and wide flat calculations
+  eight = _mm_set1_epi16(8);
+  four = _mm_set1_epi16(4);
+
+  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
+                                _mm_add_epi16(p4, p3));
+  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
+                                _mm_add_epi16(q4, q3));
+
+  pixelFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
+  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixelFilter_p2p1p0);
+
+  pixelFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
+  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixelFilter_q2q1q0);
+  pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+                                                     pixelFilter_q));
+  pixelFilter_p2p1p0 = _mm_add_epi16(four,
+                                     _mm_add_epi16(pixelFilter_p2p1p0,
+                                                   pixelFilter_q2q1q0));
+  flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(p7, p0)), 4);
+  flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(q7, q0)), 4);
+  flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
+                                         _mm_add_epi16(p3, p0)), 3);
+  flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
+                                         _mm_add_epi16(q3, q0)), 3);
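+  // In scalar terms the two p0 outputs above are:
+  //   flat2_p0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 +
+  //                                 2 * p0 + q0 + q1 + q2 + q3 + q4 + q5 +
+  //                                 q6, 4)
+  //   flat_p0  = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3)
+  // (the q0 outputs mirror them); later taps reuse these sums incrementally.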
+
+  sum_p7 = _mm_add_epi16(p7, p7);
+  sum_q7 = _mm_add_epi16(q7, q7);
+  sum_p3 = _mm_add_epi16(p3, p3);
+  sum_q3 = _mm_add_epi16(q3, q3);
+
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
+  flat2_p1 = _mm_srli_epi16(
+      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
+  flat2_q1 = _mm_srli_epi16(
+      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
+
+  pixelFilter_q2q1q0 = _mm_sub_epi16(pixelFilter_p2p1p0, p2);
+  pixelFilter_p2p1p0 = _mm_sub_epi16(pixelFilter_p2p1p0, q2);
+  flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
+                                         _mm_add_epi16(sum_p3, p1)), 3);
+  flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q2q1q0,
+                                         _mm_add_epi16(sum_q3, q1)), 3);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  sum_p3 = _mm_add_epi16(sum_p3, p3);
+  sum_q3 = _mm_add_epi16(sum_q3, q3);
+
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
+  flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p2)), 4);
+  flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q2)), 4);
+
+  pixelFilter_p2p1p0 = _mm_sub_epi16(pixelFilter_p2p1p0, q1);
+  pixelFilter_q2q1q0 = _mm_sub_epi16(pixelFilter_q2q1q0, p1);
+  flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
+                                         _mm_add_epi16(sum_p3, p2)), 3);
+  flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q2q1q0,
+                                         _mm_add_epi16(sum_q3, q2)), 3);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
+  flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p3)), 4);
+  flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q3)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
+  flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p4)), 4);
+  flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q4)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
+  flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p5)), 4);
+  flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q5)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
+  flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p6)), 4);
+  flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q6)), 4);
+
+  //  wide flat
+  //  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  //  high_filter8
+  p2 = _mm_andnot_si128(flat, p2);
+  //  p2 remains unchanged if !(flat && mask)
+  flat_p2 = _mm_and_si128(flat, flat_p2);
+  //  when (flat && mask)
+  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
+  q2 = _mm_andnot_si128(flat, q2);
+  flat_q2 = _mm_and_si128(flat, flat_q2);
+  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values
+
+  ps1 = _mm_andnot_si128(flat, ps1);
+  //  p1 takes the value assigned to it in filter4 if !(flat && mask)
+  flat_p1 = _mm_and_si128(flat, flat_p1);
+  //  when (flat && mask)
+  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
+  qs1 = _mm_andnot_si128(flat, qs1);
+  flat_q1 = _mm_and_si128(flat, flat_q1);
+  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values
+
+  ps0 = _mm_andnot_si128(flat, ps0);
+  //  p0 takes the value assigned to it in filter4 if !(flat && mask)
+  flat_p0 = _mm_and_si128(flat, flat_p0);
+  //  when (flat && mask)
+  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
+  qs0 = _mm_andnot_si128(flat, qs0);
+  flat_q0 = _mm_and_si128(flat, flat_q0);
+  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
+  // end high_filter8
+
+  // high_filter16
+  p6 = _mm_andnot_si128(flat2, p6);
+  //  p6 remains unchanged if !(flat2 && flat && mask)
+  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+  //  get values for when (flat2 && flat && mask)
+  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
+  q6 = _mm_andnot_si128(flat2, q6);
+  //  q6 remains unchanged if !(flat2 && flat && mask)
+  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+  //  get values for when (flat2 && flat && mask)
+  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
+  _mm_store_si128((__m128i *)(s - 7 * p), p6);
+  _mm_store_si128((__m128i *)(s + 6 * p), q6);
+
+  p5 = _mm_andnot_si128(flat2, p5);
+  //  p5 remains unchanged if !(flat2 && flat && mask)
+  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+  //  get values for when (flat2 && flat && mask)
+  p5 = _mm_or_si128(p5, flat2_p5);
+  //  full list of p5 values
+  q5 = _mm_andnot_si128(flat2, q5);
+  //  q5 remains unchanged if !(flat2 && flat && mask)
+  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+  //  get values for when (flat2 && flat && mask)
+  q5 = _mm_or_si128(q5, flat2_q5);
+  //  full list of q5 values
+  _mm_store_si128((__m128i *)(s - 6 * p), p5);
+  _mm_store_si128((__m128i *)(s + 5 * p), q5);
+
+  p4 = _mm_andnot_si128(flat2, p4);
+  //  p4 remains unchanged if !(flat2 && flat && mask)
+  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+  //  get values for when (flat2 && flat && mask)
+  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
+  q4 = _mm_andnot_si128(flat2, q4);
+  //  q4 remains unchanged if !(flat2 && flat && mask)
+  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+  //  get values for when (flat2 && flat && mask)
+  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
+  _mm_store_si128((__m128i *)(s - 5 * p), p4);
+  _mm_store_si128((__m128i *)(s + 4 * p), q4);
+
+  p3 = _mm_andnot_si128(flat2, p3);
+  //  p3 takes value from high_filter8 if !(flat2 && flat && mask)
+  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+  //  get values for when (flat2 && flat && mask)
+  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
+  q3 = _mm_andnot_si128(flat2, q3);
+  //  q3 takes value from high_filter8 if !(flat2 && flat && mask)
+  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+  //  get values for when (flat2 && flat && mask)
+  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
+  _mm_store_si128((__m128i *)(s - 4 * p), p3);
+  _mm_store_si128((__m128i *)(s + 3 * p), q3);
+
+  p2 = _mm_andnot_si128(flat2, p2);
+  //  p2 takes value from high_filter8 if !(flat2 && flat && mask)
+  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+  //  get values for when (flat2 && flat && mask)
+  p2 = _mm_or_si128(p2, flat2_p2);
+  //  full list of p2 values
+  q2 = _mm_andnot_si128(flat2, q2);
+  //  q2 takes value from high_filter8 if !(flat2 && flat && mask)
+  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+  //  get values for when (flat2 && flat && mask)
+  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
+  _mm_store_si128((__m128i *)(s - 3 * p), p2);
+  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+
+  p1 = _mm_andnot_si128(flat2, p1);
+  //  p1 takes value from high_filter8 if !(flat2 && flat && mask)
+  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+  //  get values for when (flat2 && flat && mask)
+  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
+  q1 = _mm_andnot_si128(flat2, q1);
+  //  q1 takes value from high_filter8 if !(flat2 && flat && mask)
+  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+  //  get values for when (flat2 && flat && mask)
+  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
+  _mm_store_si128((__m128i *)(s - 2 * p), p1);
+  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+
+  p0 = _mm_andnot_si128(flat2, p0);
+  //  p0 takes value from high_filter8 if !(flat2 && flat && mask)
+  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+  //  get values for when (flat2 && flat && mask)
+  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
+  q0 = _mm_andnot_si128(flat2, q0);
+  //  q0 takes value from high_filter8 if !(flat2 && flat && mask)
+  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+  //  get values for when (flat2 && flat && mask)
+  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
+  _mm_store_si128((__m128i *)(s - 1 * p), p0);
+  _mm_store_si128((__m128i *)(s + 0 * p), q0);
+}
+
+static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s,
+                                                    int p,
+                                                    const uint8_t *_blimit,
+                                                    const uint8_t *_limit,
+                                                    const uint8_t *_thresh,
+                                                    int bd) {
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh,
+                                         bd);
+}
+
+// TODO(yunqingwang): remove count and call these 2 functions (8 or 16)
+// directly.
+void vp9_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
+                                       const uint8_t *_blimit,
+                                       const uint8_t *_limit,
+                                       const uint8_t *_thresh,
+                                       int count, int bd) {
+  if (count == 1)
+    highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+  else
+    highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
+}
+
+void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh,
+                                      int count, int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op2, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op1, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op0, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq2, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq1, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq0, 16);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero),
+      bd - 8);
+  const __m128i limit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero),
+      bd - 8);
+  const __m128i thresh = _mm_slli_epi16(
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero),
+      bd - 8);
+  __m128i mask, hev, flat;
+  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+  const __m128i four = _mm_set1_epi16(4);
+  __m128i workp_a, workp_b, workp_shft;
+
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  const __m128i ps1 = _mm_subs_epi16(p1, t80);
+  const __m128i ps0 = _mm_subs_epi16(p0, t80);
+  const __m128i qs0 = _mm_subs_epi16(q0, t80);
+  const __m128i qs1 = _mm_subs_epi16(q1, t80);
+  __m128i filt;
+  __m128i work_a;
+  __m128i filter1, filter2;
+
+  (void)count;
+
+  // filter_mask and hev_mask
+  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+                          _mm_subs_epu16(p0, p1));
+  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+                          _mm_subs_epu16(q0, q1));
+
+  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+                          _mm_subs_epu16(q0, p0));
+  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+                          _mm_subs_epu16(q1, p1));
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_max_epi16(abs_p1p0, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  mask = _mm_max_epi16(abs_q1q0, mask);
+  // mask |= (abs(q1 - q0) > limit) * -1;
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);
+
+  // flat_mask4
+  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+                                    _mm_subs_epu16(p0, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q0),
+                                    _mm_subs_epu16(q0, q2)));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
+                                    _mm_subs_epu16(p0, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q0),
+                                    _mm_subs_epu16(q0, q3)));
+  flat = _mm_max_epi16(work, flat);
+  flat = _mm_max_epi16(abs_p1p0, flat);
+  flat = _mm_max_epi16(abs_q1q0, flat);
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
+
+  // 'four' is folded into workp_a below as the rounding term of
+  // ROUND_POWER_OF_TWO(sum, 3)
+
+  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+
+  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
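+  // Each flat_o* row above is built from a running sum: one trailing tap is
+  // subtracted and one leading tap added per step, rather than recomputing
+  // the full ROUND_POWER_OF_TWO(sum, 3) from scratch.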
+
+  // lp filter
+  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+  filt = _mm_and_si128(filt, hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  filt = signed_char_clamp_bd_sse2(filt, bd);
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = _mm_adds_epi16(filt, t4);
+  filter2 = _mm_adds_epi16(filt, t3);
+
+  // Filter1 >> 3
+  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
+  filter1 = _mm_srai_epi16(filter1, 3);
+
+  // Filter2 >> 3
+  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
+  filter2 = _mm_srai_epi16(filter2, 3);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filter1, t1);
+  filt = _mm_srai_epi16(filt, 1);
+  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+  filt = _mm_andnot_si128(hev, filt);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  q0 = _mm_load_si128((__m128i *)flat_oq0);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q0 = _mm_and_si128(flat, q0);
+  q0 = _mm_or_si128(work_a, q0);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  q1 = _mm_load_si128((__m128i *)flat_oq1);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q1 = _mm_and_si128(flat, q1);
+  q1 = _mm_or_si128(work_a, q1);
+
+  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q2 = _mm_load_si128((__m128i *)flat_oq2);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q2 = _mm_and_si128(flat, q2);
+  q2 = _mm_or_si128(work_a, q2);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  p0 = _mm_load_si128((__m128i *)flat_op0);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p0 = _mm_and_si128(flat, p0);
+  p0 = _mm_or_si128(work_a, p0);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  p1 = _mm_load_si128((__m128i *)flat_op1);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p1 = _mm_and_si128(flat, p1);
+  p1 = _mm_or_si128(work_a, p1);
+
+  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p2 = _mm_load_si128((__m128i *)flat_op2);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p2 = _mm_and_si128(flat, p2);
+  p2 = _mm_or_si128(work_a, p2);
+
+  _mm_store_si128((__m128i *)(s - 3 * p), p2);
+  _mm_store_si128((__m128i *)(s - 2 * p), p1);
+  _mm_store_si128((__m128i *)(s - 1 * p), p0);
+  _mm_store_si128((__m128i *)(s + 0 * p), q0);
+  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+}
+
+void vp9_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit0,
+                                           const uint8_t *_limit0,
+                                           const uint8_t *_thresh0,
+                                           const uint8_t *_blimit1,
+                                           const uint8_t *_limit1,
+                                           const uint8_t *_thresh1,
+                                           int bd) {
+  vp9_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vp9_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
+                                   1, bd);
+}
+
+void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh,
+                                      int count, int bd) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(
+          _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
+  const __m128i limit = _mm_slli_epi16(
+      _mm_unpacklo_epi8(
+          _mm_load_si128((const __m128i *)_limit), zero), bd - 8);
+  const __m128i thresh = _mm_slli_epi16(
+      _mm_unpacklo_epi8(
+          _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+  __m128i mask, hev, flat;
+  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+                                        _mm_subs_epu16(p0, p1));
+  const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+                                        _mm_subs_epu16(q0, q1));
+  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+                                  _mm_subs_epu16(q0, p0));
+  __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+                                  _mm_subs_epu16(q1, p1));
+  __m128i work;
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+  const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
+  const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
+  const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
+  // equivalent to shifting 0x1f left by bitdepth - 8
+  // and setting new bits to 1
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
+  // equivalent to shifting 0x7f left by bitdepth - 8
+  // and setting new bits to 1
+  const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                     t80);
+  const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                     t80);
+  const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                     t80);
+  const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                     t80);
+  __m128i filt;
+  __m128i work_a;
+  __m128i filter1, filter2;
+
+  (void)count;
+
+  // filter_mask and hev_mask
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_max_epi16(flat, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  // mask |= (abs(q1 - q0) > limit) * -1;
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);
+
+  // filter4
+  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+  filt = _mm_and_si128(filt, hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3
+  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
+  filter1 = _mm_srli_epi16(filter1, 3);
+  work_a = _mm_and_si128(work_a, tffe0);  // sign bits for the values < 0
+  filter1 = _mm_and_si128(filter1, t1f);  // clamp the range
+  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits
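+  // The cmpgt/srli/and/or sequence above emulates an arithmetic
+  // (sign-extending) right shift by 3 for values within the current
+  // bit depth's signed range.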
+
+  // Filter2 >> 3
+  work_a = _mm_cmpgt_epi16(zero, filter2);
+  filter2 = _mm_srli_epi16(filter2, 3);
+  work_a = _mm_and_si128(work_a, tffe0);
+  filter2 = _mm_and_si128(filter2, t1f);
+  filter2 = _mm_or_si128(filter2, work_a);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filter1, t1);
+  work_a = _mm_cmpgt_epi16(zero, filt);
+  filt = _mm_srli_epi16(filt, 1);
+  work_a = _mm_and_si128(work_a, tff80);
+  filt = _mm_and_si128(filt, t7f);
+  filt = _mm_or_si128(filt, work_a);
+
+  filt = _mm_andnot_si128(hev, filt);
+
+  q0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+  q1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
+  p0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+  p1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit0,
+                                           const uint8_t *_limit0,
+                                           const uint8_t *_thresh0,
+                                           const uint8_t *_blimit1,
+                                           const uint8_t *_limit1,
+                                           const uint8_t *_thresh1,
+                                           int bd) {
+  vp9_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vp9_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
+                                   bd);
+}
+
+static INLINE void highbd_transpose(uint16_t *src[], int in_p,
+                                    uint16_t *dst[], int out_p,
+                                    int num_8x8_to_transpose) {
+  int idx8x8 = 0;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
+  do {
+    uint16_t *in = src[idx8x8];
+    uint16_t *out = dst[idx8x8];
+
+    p0 = _mm_loadu_si128((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    p1 = _mm_loadu_si128((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    p2 = _mm_loadu_si128((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    p3 = _mm_loadu_si128((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    p4 = _mm_loadu_si128((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    p5 = _mm_loadu_si128((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    p6 = _mm_loadu_si128((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    p7 = _mm_loadu_si128((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 00 10 01 11 02 12 03 13
+    x0 = _mm_unpacklo_epi16(p0, p1);
+    // 20 30 21 31 22 32 23 33
+    x1 = _mm_unpacklo_epi16(p2, p3);
+    // 40 50 41 51 42 52 43 53
+    x2 = _mm_unpacklo_epi16(p4, p5);
+    // 60 70 61 71 62 72 63 73
+    x3 = _mm_unpacklo_epi16(p6, p7);
+    // 00 10 20 30 01 11 21 31
+    x4 = _mm_unpacklo_epi32(x0, x1);
+    // 40 50 60 70 41 51 61 71
+    x5 = _mm_unpacklo_epi32(x2, x3);
+    // 00 10 20 30 40 50 60 70
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 01 11 21 31 41 51 61 71
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 0*out_p), x6);
+    // 00 10 20 30 40 50 60 70
+    _mm_storeu_si128((__m128i *)(out + 1*out_p), x7);
+    // 01 11 21 31 41 51 61 71
+
+    // 02 12 22 32 03 13 23 33
+    x4 = _mm_unpackhi_epi32(x0, x1);
+    // 42 52 62 72 43 53 63 73
+    x5 = _mm_unpackhi_epi32(x2, x3);
+    // 02 12 22 32 42 52 62 72
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 2*out_p), x6);
+    // 02 12 22 32 42 52 62 72
+    _mm_storeu_si128((__m128i *)(out + 3*out_p), x7);
+    // 03 13 23 33 43 53 63 73
+
+    // 04 14 05 15 06 16 07 17
+    x0 = _mm_unpackhi_epi16(p0, p1);
+    // 24 34 25 35 26 36 27 37
+    x1 = _mm_unpackhi_epi16(p2, p3);
+    // 44 54 45 55 46 56 47 57
+    x2 = _mm_unpackhi_epi16(p4, p5);
+    // 64 74 65 75 66 76 67 77
+    x3 = _mm_unpackhi_epi16(p6, p7);
+    // 04 14 24 34 05 15 25 35
+    x4 = _mm_unpacklo_epi32(x0, x1);
+    // 44 54 64 74 45 55 65 75
+    x5 = _mm_unpacklo_epi32(x2, x3);
+    // 04 14 24 34 44 54 64 74
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 05 15 25 35 45 55 65 75
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 4*out_p), x6);
+    // 04 14 24 34 44 54 64 74
+    _mm_storeu_si128((__m128i *)(out + 5*out_p), x7);
+    // 05 15 25 35 45 55 65 75
+
+    // 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi32(x0, x1);
+    // 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi32(x2, x3);
+    // 06 16 26 36 46 56 66 76
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 6*out_p), x6);
+    // 06 16 26 36 46 56 66 76
+    _mm_storeu_si128((__m128i *)(out + 7*out_p), x7);
+    // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);
+}
+
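+// Transposes a 16-row x 8-column region -- two 8x8 halves, in0 on top and
+// in1 below -- into an 8-row x 16-column block at 'out'.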
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1,
+                                        int in_p, uint16_t *out, int out_p) {
+  uint16_t *src0[1];
+  uint16_t *src1[1];
+  uint16_t *dest0[1];
+  uint16_t *dest1[1];
+  src0[0] = in0;
+  src1[0] = in1;
+  dest0[0] = out;
+  dest1[0] = out + 8;
+  highbd_transpose(src0, in_p, dest0, out_p, 1);
+  highbd_transpose(src1, in_p, dest1, out_p, 1);
+}
+
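+// The vertical filters below reuse the horizontal kernels: the columns of
+// interest are transposed into a small contiguous buffer, filtered as rows,
+// then transposed back into place.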
+void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh,
+                                    int count, int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 8);
+  uint16_t *src[1];
+  uint16_t *dst[1];
+  (void)count;
+
+  // Transpose 8x8
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  highbd_transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+                                   bd);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
+                                         const uint8_t *blimit0,
+                                         const uint8_t *limit0,
+                                         const uint8_t *thresh0,
+                                         const uint8_t *blimit1,
+                                         const uint8_t *limit1,
+                                         const uint8_t *thresh1,
+                                         int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 8);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh,
+                                    int count, int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 8);
+  uint16_t *src[1];
+  uint16_t *dst[1];
+  (void)count;
+
+  // Transpose 8x8
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  highbd_transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+                                   bd);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
+                                         const uint8_t *blimit0,
+                                         const uint8_t *limit0,
+                                         const uint8_t *thresh0,
+                                         const uint8_t *blimit1,
+                                         const uint8_t *limit1,
+                                         const uint8_t *thresh1,
+                                         int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 8);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vp9_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh,
+                                     int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 16);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  src[0] = s - 8;
+  src[1] = s;
+  dst[0] = t_dst;
+  dst[1] = t_dst + 8 * 8;
+
+  // Transpose 16x8
+  highbd_transpose(src, p, dst, 8, 2);
+
+  // Loop filtering
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit,
+                                         thresh, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8 * 8;
+  dst[0] = s - 8;
+  dst[1] = s;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 2);
+}
+
+void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
+                                          int p,
+                                          const uint8_t *blimit,
+                                          const uint8_t *limit,
+                                          const uint8_t *thresh,
+                                          int bd) {
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 16);
+
+  //  Transpose 16x16
+  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+  //  Loop filtering
+  highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+                                          thresh, bd);
+
+  //  Transpose back
+  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+}
index 4e85caf45cdef061c50465c5d0c778ee6d246ac9..db55de16de09a54c68de0d538fb21bc206663ca2 100644 (file)
@@ -196,6 +196,64 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
   if (eob > 0) {
     TX_TYPE tx_type = DCT_DCT;
     tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (xd->lossless) {
+        tx_type = DCT_DCT;
+        vp9_high_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      } else {
+        const PLANE_TYPE plane_type = pd->plane_type;
+        switch (tx_size) {
+          case TX_4X4:
+            tx_type = get_tx_type_4x4(plane_type, xd, block);
+            vp9_high_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_8X8:
+            tx_type = get_tx_type(plane_type, xd);
+            vp9_high_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_16X16:
+            tx_type = get_tx_type(plane_type, xd);
+            vp9_high_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_32X32:
+            tx_type = DCT_DCT;
+            vp9_high_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+        }
+      }
+    } else {
+      if (xd->lossless) {
+        tx_type = DCT_DCT;
+        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+      } else {
+        const PLANE_TYPE plane_type = pd->plane_type;
+        switch (tx_size) {
+          case TX_4X4:
+            tx_type = get_tx_type_4x4(plane_type, xd, block);
+            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_8X8:
+            tx_type = get_tx_type(plane_type, xd);
+            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_16X16:
+            tx_type = get_tx_type(plane_type, xd);
+            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_32X32:
+            tx_type = DCT_DCT;
+            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+            return;
+        }
+      }
+    }
+#else
     if (xd->lossless) {
       tx_type = DCT_DCT;
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
@@ -220,8 +278,10 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
           break;
         default:
           assert(0 && "Invalid transform size");
+          return;
       }
     }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     if (eob == 1) {
       vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
@@ -592,13 +652,18 @@ static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   update |= read_delta_q(rb, &cm->y_dc_delta_q);
   update |= read_delta_q(rb, &cm->uv_dc_delta_q);
   update |= read_delta_q(rb, &cm->uv_ac_delta_q);
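+  // The dequant tables are built for a specific bit depth, so they must be
+  // rebuilt whenever the bit depth signalled in this header differs from
+  // the depth they were last initialized for.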
-  if (update)
+  if (update || cm->bit_depth != cm->dequant_bit_depth) {
     vp9_init_dequantizer(cm);
+    cm->dequant_bit_depth = cm->bit_depth;
+  }
 
   xd->lossless = cm->base_qindex == 0 &&
                  cm->y_dc_delta_q == 0 &&
                  cm->uv_dc_delta_q == 0 &&
                  cm->uv_ac_delta_q == 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+  xd->bd = (int)cm->bit_depth;
+#endif
 }
 
 static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) {
@@ -1139,8 +1204,17 @@ BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) {
 
 static void read_bitdepth_colorspace_sampling(
     VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
-  if (cm->profile >= PROFILE_2)
+  if (cm->profile >= PROFILE_2) {
     cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
+#if CONFIG_VP9_HIGHBITDEPTH
+    cm->use_highbitdepth = 1;
+#endif
+  } else {
+    cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+    cm->use_highbitdepth = 0;
+#endif
+  }
   cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
   if (cm->color_space != SRGB) {
     vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
@@ -1244,6 +1318,10 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
         // case (normative).
         cm->color_space = BT_601;
         cm->subsampling_y = cm->subsampling_x = 1;
+        cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+        cm->use_highbitdepth = 0;
+#endif
       }
 
       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
@@ -1284,6 +1362,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
       }
     }
   }
+#if CONFIG_VP9_HIGHBITDEPTH
+  get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
+#endif
 
   if (pbi->need_resync) {
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
index 6ee3d7037faad92d4463ba1e91bc6a5b0c8293ea..b7e1a009b824d24e4e6cc59a2f2b6da677e13c07 100644 (file)
@@ -69,6 +69,7 @@ VP9Decoder *vp9_decoder_create() {
   cm->current_video_frame = 0;
   pbi->ready_for_new_data = 1;
   cm->bit_depth = VPX_BITS_8;
+  cm->dequant_bit_depth = VPX_BITS_8;
 
   // vp9_init_dequantizer() is first called here. Add check in
   // frame_init_dequantizer() to avoid unnecessary calling of
index b96f00fd19cd15c2336c00149671cc0c7047b3ef..15de5c473b3e4ced4cd98e1d40fd08007629a679 100644 (file)
@@ -34,6 +34,9 @@ static int segment_id[MAX_SEGMENTS] = { 5, 3, 1, 0, 2, 4, 6, 7 };
 #define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
 
 DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};
+#if CONFIG_VP9_HIGHBITDEPTH
+DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = {0};
+#endif
 
 unsigned int vp9_vaq_segment_id(int energy) {
   ENERGY_IN_BOUNDS(energy);
@@ -126,14 +129,40 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
     const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
     const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
     int avg;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      high_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                    CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, &sse,
+                    &avg);
+      sse >>= 2 * (xd->bd - 8);
+      avg >>= (xd->bd - 8);
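+      // sse scales with the square of the sample scale and avg linearly,
+      // so both are normalized back to 8-bit units here.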
+    } else {
+      variance(x->plane[0].src.buf, x->plane[0].src.stride,
+               vp9_64_zeros, 0, bw, bh, &sse, &avg);
+    }
+#else
     variance(x->plane[0].src.buf, x->plane[0].src.stride,
              vp9_64_zeros, 0, bw, bh, &sse, &avg);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     var = sse - (((int64_t)avg * avg) / (bw * bh));
     return (256 * var) / (bw * bh);
   } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+                               x->plane[0].src.stride,
+                               CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros),
+                               0, &sse);
+    } else {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+                               x->plane[0].src.stride,
+                               vp9_64_zeros, 0, &sse);
+    }
+#else
     var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
                              x->plane[0].src.stride,
                              vp9_64_zeros, 0, &sse);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     return (256 * var) >> num_pels_log2_lookup[bs];
   }
 }
index f658ddafb1fd7805426a5a55d4a0ab4a701f1b5a..5114fc55f43b1a0fd8dceeb42e87b8c55242b9aa 100644 (file)
@@ -120,16 +120,28 @@ static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) {
 }
 
 static void pack_mb_tokens(vp9_writer *w,
-                           TOKENEXTRA **tp, const TOKENEXTRA *const stop) {
+                           TOKENEXTRA **tp, const TOKENEXTRA *const stop,
+                           vpx_bit_depth_t bit_depth) {
   TOKENEXTRA *p = *tp;
 
   while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
     const struct vp9_token *const a = &vp9_coef_encodings[t];
-    const vp9_extra_bit *const b = &vp9_extra_bits[t];
     int i = 0;
     int v = a->value;
     int n = a->len;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const vp9_extra_bit *b;
+    if (bit_depth == VPX_BITS_12)
+      b = &vp9_extra_bits_high12[t];
+    else if (bit_depth == VPX_BITS_10)
+      b = &vp9_extra_bits_high10[t];
+    else
+      b = &vp9_extra_bits[t];
+#else
+    const vp9_extra_bit *const b = &vp9_extra_bits[t];
+    (void) bit_depth;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     /* skip one or two nodes */
     if (p->skip_eob_node) {
@@ -387,7 +399,7 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
   }
 
   assert(*tok < tok_end);
-  pack_mb_tokens(w, tok, tok_end);
+  pack_mb_tokens(w, tok, tok_end, cm->bit_depth);
 }
 
 static void write_partition(const VP9_COMMON *const cm,
index 681b2a575b584d7148b31a92f1412fe72c2aa1e0..11cb27f4305845653a3cbf8e04d256e7daa82f83 100644 (file)
@@ -143,7 +143,7 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
 
   // Otherwise, we try to dampen the filter if the delta is not too high.
   delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
-           >> 8) + 1;
+           >> num_pels_log2_lookup[bs]) + 1;
 
   if (delta >= delta_thresh(bs, increase_denoising)) {
     return COPY_BLOCK;
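
The change above replaces a hard-coded ">> 8" (a divide by 256, correct only for 16x16 blocks) with a shift by the block's actual pixel count, so the per-pixel dampening test scales with block size. A reduced sketch under that reading, with a hypothetical two-entry lookup:

#include <stdlib.h>

/* Hypothetical two-entry subset of the real per-block-size lookup:
 * log2 of the pixel count, 8 for 16x16 (256 px), 12 for 64x64 (4096 px). */
enum { BLK_16X16, BLK_64X64 };
static const int num_pels_log2[2] = { 8, 12 };

static int dampening_delta(int total_adj, int strong_thresh, int bs) {
  /* Per-pixel excess over the threshold, plus one; the old ">> 8"
   * divided a 64x64 block by 256 instead of 4096. */
  return ((abs(total_adj) - strong_thresh) >> num_pels_log2[bs]) + 1;
}
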
index e99e7b5e473679ef6c343d7d538a706c5ce0e712..825fcf3069e8f3d9f5a216bf6a0ff64b7b071dd3 100644 (file)
@@ -61,16 +61,51 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
 // Eventually this should be replaced by custom no-reference routines,
 //  which will be faster.
 static const uint8_t VP9_VAR_OFFS[64] = {
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128
 };
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = {
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = {
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16
+};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
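
The three offset tables hold the same neutral reference at each depth: mid-gray is 128 << (bd - 8), which gives 128, 512 (128*4) and 2048 (128*16) for 8-, 10- and 12-bit sources. A quick standalone check of that relation:

#include <assert.h>

/* Mid-gray at a given bit depth is 128 scaled up from the 8-bit range. */
static int mid_gray(int bd) { return 128 << (bd - 8); }

int main(void) {
  assert(mid_gray(8) == 128);
  assert(mid_gray(10) == 128 * 4);   /* 512, the VP9_HIGH_VAR_OFFS_10 entry */
  assert(mid_gray(12) == 128 * 16);  /* 2048, the VP9_HIGH_VAR_OFFS_12 entry */
  return 0;
}
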
 static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
                                               const struct buf_2d *ref,
                                               BLOCK_SIZE bs) {
@@ -80,6 +115,32 @@ static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static unsigned int high_get_sby_perpixel_variance(
+    VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) {
+  unsigned int var, sse;
+  switch (bd) {
+    case 10:
+      var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10),
+                               0, &sse);
+      break;
+    case 12:
+      var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12),
+                               0, &sse);
+      break;
+    case 8:
+    default:
+      var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8),
+                               0, &sse);
+      break;
+  }
+  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
                                                    const struct buf_2d *ref,
                                                    int mi_row, int mi_col,
@@ -419,6 +480,22 @@ static void choose_partitioning(VP9_COMP *cpi,
   } else {
     d = VP9_VAR_OFFS;
     dp = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      switch (xd->bd) {
+        case 10:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10);
+          break;
+        case 12:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12);
+          break;
+        case 8:
+        default:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8);
+          break;
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   // Fill in the entire tree of 8x8 variances for splits.
@@ -734,7 +811,17 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
   // Set to zero to make sure we do not use the previous encoded frame stats
   mbmi->skip = 0;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    x->source_variance =
+        high_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize, xd->bd);
+  } else {
+    x->source_variance =
+        get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+#else
   x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Save rdmult before it might be changed, so it can be restored later.
   orig_rdmult = x->rdmult;
@@ -3150,9 +3237,34 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
 
   for (i = 0; i < cm->mb_rows; i++) {
     for (j = 0; j < cm->mb_cols; j++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        switch (cm->bit_depth) {
+          case VPX_BITS_8:
+            vp9_high_get16x16var(src, src_stride, last_src, last_stride,
+                                 &var16->sse, &var16->sum);
+            break;
+          case VPX_BITS_10:
+            vp9_high_10_get16x16var(src, src_stride, last_src, last_stride,
+                                    &var16->sse, &var16->sum);
+            break;
+          case VPX_BITS_12:
+            vp9_high_12_get16x16var(src, src_stride, last_src, last_stride,
+                                    &var16->sse, &var16->sum);
+            break;
+          default:
+            assert(0 && "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10"
+                   " or VPX_BITS_12");
+            return -1;
+        }
+      } else {
+        vp9_get16x16var(src, src_stride, last_src, last_stride,
+                        &var16->sse, &var16->sum);
+      }
+#else
       vp9_get16x16var(src, src_stride, last_src, last_stride,
                       &var16->sse, &var16->sum);
-
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       var16->var = var16->sse -
           (((uint32_t)var16->sum * var16->sum) >> 8);
 
@@ -3294,7 +3406,15 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 
   cm->tx_mode = select_tx_mode(cpi);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    x->fwd_txm4x4 = xd->lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4;
+  else
+    x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+  x->high_itxm_add = xd->lossless ? vp9_high_iwht4x4_add : vp9_high_idct4x4_add;
+#else
   x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
 
   if (xd->lossless) {
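
The selection above pairs the reversible 4x4 Walsh-Hadamard transform with lossless mode and the DCT with lossy mode, and binds the vp9_high_* variants when the build codes 16-bit pixels. A miniature version of the dispatch (stand-in functions and signatures are illustrative, not the library's):

#include <stdint.h>

typedef void (*fwd_txfm_fn)(const int16_t *input, int16_t *output, int stride);

/* Illustrative stand-ins for the transform pairs. */
static void fwht4x4(const int16_t *i, int16_t *o, int s) {
  (void)i; (void)o; (void)s;
}
static void fdct4x4(const int16_t *i, int16_t *o, int s) {
  (void)i; (void)o; (void)s;
}
static void high_fwht4x4(const int16_t *i, int16_t *o, int s) {
  (void)i; (void)o; (void)s;
}
static void high_fdct4x4(const int16_t *i, int16_t *o, int s) {
  (void)i; (void)o; (void)s;
}

static fwd_txfm_fn pick_fwd_4x4(int use_highbitdepth, int lossless) {
  /* Lossless needs the exactly invertible Walsh-Hadamard transform;
   * a high-bit-depth build must bind the wide variants. */
  if (use_highbitdepth)
    return lossless ? high_fwht4x4 : high_fdct4x4;
  return lossless ? fwht4x4 : fdct4x4;
}
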
index 2eae1497002cc055cc3c40fe59e7631e398e4ba5..3b456b9720b28cf632b36a2a19433a538a2cfdcb 100644 (file)
@@ -51,6 +51,29 @@ void vp9_subtract_block_c(int rows, int cols,
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_subtract_block_c(int rows, int cols,
+                               int16_t *diff, ptrdiff_t diff_stride,
+                               const uint8_t *src8, ptrdiff_t src_stride,
+                               const uint8_t *pred8, ptrdiff_t pred_stride,
+                               int bd) {
+  int r, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  (void) bd;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src  += src_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -58,6 +81,13 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_high_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                            pd->dst.buf, pd->dst.stride, x->e_mbd.bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                      pd->dst.buf, pd->dst.stride);
 }
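
As in the rest of this change, high-bit-depth buffers travel as uint8_t pointers and are punned back to uint16_t at the point of use via libvpx's CONVERT_TO_SHORTPTR / CONVERT_TO_BYTEPTR macros. A self-contained sketch of the subtract dispatch; a plain cast stands in for CONVERT_TO_SHORTPTR here, the real macro is not a simple cast:

#include <stddef.h>
#include <stdint.h>

/* 8-bit path: byte-wise difference, as in vp9_subtract_block_c. */
static void subtract_block(int rows, int cols, int16_t *diff,
                           ptrdiff_t diff_stride, const uint8_t *src,
                           ptrdiff_t src_stride, const uint8_t *pred,
                           ptrdiff_t pred_stride) {
  int r, c;
  for (r = 0; r < rows; r++) {
    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}

/* High-bit-depth path: the same loop over 16-bit samples, with
 * strides counted in samples. */
static void highbd_subtract_block(int rows, int cols, int16_t *diff,
                                  ptrdiff_t diff_stride, const uint8_t *src8,
                                  ptrdiff_t src_stride, const uint8_t *pred8,
                                  ptrdiff_t pred_stride) {
  const uint16_t *src = (const uint16_t *)src8;
  const uint16_t *pred = (const uint16_t *)pred8;
  int r, c;
  for (r = 0; r < rows; r++) {
    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}
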
@@ -124,6 +154,8 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   int64_t rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt, i, final_eob;
+  const TOKENVALUE *dct_value_tokens;
+  const int16_t *dct_value_cost;
 
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
@@ -140,9 +172,24 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   tokens[eob][0].qc = 0;
   tokens[eob][1] = tokens[eob][0];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->bd == 12) {
+    dct_value_tokens = vp9_dct_value_tokens_high12_ptr;
+    dct_value_cost = vp9_dct_value_cost_high12_ptr;
+  } else if (xd->bd == 10) {
+    dct_value_tokens = vp9_dct_value_tokens_high10_ptr;
+    dct_value_cost = vp9_dct_value_cost_high10_ptr;
+  } else {
+    dct_value_tokens = vp9_dct_value_tokens_ptr;
+    dct_value_cost = vp9_dct_value_cost_ptr;
+  }
+#else
+  dct_value_tokens = vp9_dct_value_tokens_ptr;
+  dct_value_cost = vp9_dct_value_cost_ptr;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   for (i = 0; i < eob; i++)
     token_cache[scan[i]] =
-        vp9_pt_energy_class[vp9_dct_value_tokens_ptr[qcoeff[scan[i]]].token];
+        vp9_pt_energy_class[dct_value_tokens[qcoeff[scan[i]]].token];
 
   for (i = eob; i-- > 0;) {
     int base_bits, d2, dx;
@@ -156,7 +203,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       /* Evaluate the first possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->token;
+      t0 = (dct_value_tokens + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -169,8 +216,13 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = vp9_dct_value_cost_ptr[x];
+      base_bits = dct_value_cost[x];
       dx = mul * (dqcoeff[rc] - coeff[rc]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dx >>= xd->bd - 8;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       d2 = dx * dx;
       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
       tokens[i][0].error = d2 + (best ? error1 : error0);
@@ -203,7 +255,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
         t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
       } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
+        t0 = t1 = (dct_value_tokens + x)->token;
       }
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -222,10 +274,18 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = vp9_dct_value_cost_ptr[x];
+      base_bits = dct_value_cost[x];
 
       if (shortcut) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
+        } else {
+          dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        }
+#else
         dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         d2 = dx * dx;
       }
       tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
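
optimize_b keeps its distortion term on the 8-bit scale: dx is the reconstruction error in sample units, so at 10 or 12 bits it is shifted down by bd - 8 before squaring, matching the sse >>= 2*(bd-8) normalization used elsewhere in this change. A standalone check of the arithmetic (hypothetical helper; the relation is exact when dx is a multiple of 2^(bd-8)):

#include <assert.h>

/* Squared error of a pre-shifted difference tracks the full-precision
 * squared error shifted by 2 * (bd - 8). */
static int squared_err_8bit_scale(int dx_highbd, int bd) {
  const int dx = dx_highbd >> (bd - 8);
  return dx * dx;
}

int main(void) {
  /* A 10-bit error of 40 corresponds to an 8-bit error of 10. */
  assert(squared_err_8bit_scale(40, 10) == 100);
  assert((40 * 40) >> (2 * (10 - 8)) == 100);
  return 0;
}
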
@@ -310,7 +370,7 @@ static INLINE void high_fdct32x32(int rd_transform, const int16_t *src,
   else
     vp9_high_fdct32x32(src, dst, src_stride);
 }
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
@@ -328,6 +388,44 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp9_high_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                   p->round_fp, p->quant_fp, p->quant_shift,
+                                   qcoeff, dqcoeff, pd->dequant, p->zbin_extra,
+                                   eob, scan_order->scan, scan_order->iscan);
+        break;
+      case TX_16X16:
+        vp9_high_fdct16x16(src_diff, coeff, diff_stride);
+        vp9_high_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                             pd->dequant, p->zbin_extra, eob,
+                             scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vp9_high_fdct8x8(src_diff, coeff, diff_stride);
+        vp9_high_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                             pd->dequant, p->zbin_extra, eob,
+                             scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_high_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                             pd->dequant, p->zbin_extra, eob,
+                             scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
@@ -379,6 +477,40 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vp9_high_fdct32x32_1(src_diff, coeff, diff_stride);
+        vp9_high_quantize_dc_32x32(coeff, x->skip_block, p->round,
+                                   p->quant_fp[0], qcoeff, dqcoeff,
+                                   pd->dequant[0], eob);
+        break;
+      case TX_16X16:
+        vp9_high_fdct16x16_1(src_diff, coeff, diff_stride);
+        vp9_high_quantize_dc(coeff, x->skip_block, p->round,
+                             p->quant_fp[0], qcoeff, dqcoeff,
+                             pd->dequant[0], eob);
+        break;
+      case TX_8X8:
+        vp9_high_fdct8x8_1(src_diff, coeff, diff_stride);
+        vp9_high_quantize_dc(coeff, x->skip_block, p->round,
+                             p->quant_fp[0], qcoeff, dqcoeff,
+                             pd->dequant[0], eob);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_high_quantize_dc(coeff, x->skip_block, p->round,
+                             p->quant_fp[0], qcoeff, dqcoeff,
+                             pd->dequant[0], eob);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       vp9_fdct32x32_1(src_diff, coeff, diff_stride);
@@ -426,6 +558,44 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp9_high_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                  p->round, p->quant, p->quant_shift, qcoeff,
+                                  dqcoeff, pd->dequant, p->zbin_extra, eob,
+                                  scan_order->scan, scan_order->iscan);
+        break;
+      case TX_16X16:
+        vp9_high_fdct16x16(src_diff, coeff, diff_stride);
+        vp9_high_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                            p->quant, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, p->zbin_extra, eob,
+                            scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vp9_high_fdct8x8(src_diff, coeff, diff_stride);
+        vp9_high_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                            p->quant, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, p->zbin_extra, eob,
+                            scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_high_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                            p->quant, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, p->zbin_extra, eob,
+                            scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
@@ -520,6 +690,34 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
 
   if (x->skip_encode || p->eobs[block] == 0)
     return;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vp9_high_idct32x32_add(dqcoeff, dst, pd->dst.stride,
+                               p->eobs[block], xd->bd);
+        break;
+      case TX_16X16:
+        vp9_high_idct16x16_add(dqcoeff, dst, pd->dst.stride,
+                               p->eobs[block], xd->bd);
+        break;
+      case TX_8X8:
+        vp9_high_idct8x8_add(dqcoeff, dst, pd->dst.stride,
+                             p->eobs[block], xd->bd);
+        break;
+      case TX_4X4:
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        x->high_itxm_add(dqcoeff, dst, pd->dst.stride,
+                         p->eobs[block], xd->bd);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   switch (tx_size) {
     case TX_32X32:
@@ -557,8 +755,15 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
 
   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
-  if (p->eobs[block] > 0)
+  if (p->eobs[block] > 0) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      x->high_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
+      return;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+  }
 }
 
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -622,6 +827,115 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   src = &p->src.buf[4 * (j * src_stride + i)];
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        scan_order = &vp9_default_scan_orders[TX_32X32];
+        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+        vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
+                                x->skip_encode ? src : dst,
+                                x->skip_encode ? src_stride : dst_stride,
+                                dst, dst_stride, i, j, plane);
+        if (!x->skip_recode) {
+          vp9_high_subtract_block(32, 32, src_diff, diff_stride,
+                                  src, src_stride, dst, dst_stride, xd->bd);
+          high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+          vp9_high_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                    p->round, p->quant, p->quant_shift, qcoeff,
+                                    dqcoeff, pd->dequant, p->zbin_extra, eob,
+                                    scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_high_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+        }
+        break;
+      case TX_16X16:
+        tx_type = get_tx_type(pd->plane_type, xd);
+        scan_order = &vp9_scan_orders[TX_16X16][tx_type];
+        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+        vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
+                                x->skip_encode ? src : dst,
+                                x->skip_encode ? src_stride : dst_stride,
+                                dst, dst_stride, i, j, plane);
+        if (!x->skip_recode) {
+          vp9_high_subtract_block(16, 16, src_diff, diff_stride,
+                                  src, src_stride, dst, dst_stride, xd->bd);
+          vp9_high_fht16x16(src_diff, coeff, diff_stride, tx_type);
+          vp9_high_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, p->zbin_extra, eob,
+                              scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_high_iht16x16_add(tx_type, dqcoeff, dst, dst_stride,
+                                *eob, xd->bd);
+        }
+        break;
+      case TX_8X8:
+        tx_type = get_tx_type(pd->plane_type, xd);
+        scan_order = &vp9_scan_orders[TX_8X8][tx_type];
+        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+        vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
+                                x->skip_encode ? src : dst,
+                                x->skip_encode ? src_stride : dst_stride,
+                                dst, dst_stride, i, j, plane);
+        if (!x->skip_recode) {
+          vp9_high_subtract_block(8, 8, src_diff, diff_stride,
+                                  src, src_stride, dst, dst_stride, xd->bd);
+          vp9_high_fht8x8(src_diff, coeff, diff_stride, tx_type);
+          vp9_high_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, p->zbin_extra, eob,
+                              scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_high_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+                              xd->bd);
+        }
+        break;
+      case TX_4X4:
+        tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
+        scan_order = &vp9_scan_orders[TX_4X4][tx_type];
+        mode = plane == 0 ? get_y_mode(xd->mi[0].src_mi, block) : mbmi->uv_mode;
+        vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
+                                x->skip_encode ? src : dst,
+                                x->skip_encode ? src_stride : dst_stride,
+                                dst, dst_stride, i, j, plane);
+
+        if (!x->skip_recode) {
+          vp9_high_subtract_block(4, 4, src_diff, diff_stride,
+                                  src, src_stride, dst, dst_stride, xd->bd);
+          if (tx_type != DCT_DCT)
+            vp9_high_fht4x4(src_diff, coeff, diff_stride, tx_type);
+          else
+            x->fwd_txm4x4(src_diff, coeff, diff_stride);
+          vp9_high_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, p->zbin_extra, eob,
+                              scan_order->scan, scan_order->iscan);
+        }
+
+        if (!x->skip_encode && *eob) {
+          if (tx_type == DCT_DCT)
+            // this is like vp9_short_idct4x4 but has a special case around
+            // eob<=1 which is significant (not just an optimization) for the
+            // lossless case.
+            x->high_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+          else
+            vp9_high_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);
+        }
+        break;
+      default:
+        assert(0);
+        return;
+    }
+    if (*eob)
+      *(args->skip) = 0;
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       scan_order = &vp9_default_scan_orders[TX_32X32];
index 5f5af192ca017373f68ef921e5a5bcff6958e071..da57370b005cadc3edf12aa21da3fa0aadf1cf15 100644 (file)
@@ -604,6 +604,612 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
                                            : maximum * bandwidth / 1000;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+    cpi->fn_ptr[BT].sdf = SDF; \
+    cpi->fn_ptr[BT].sdaf = SDAF; \
+    cpi->fn_ptr[BT].vf = VF; \
+    cpi->fn_ptr[BT].svf = SVF; \
+    cpi->fn_ptr[BT].svaf = SVAF; \
+    cpi->fn_ptr[BT].sdx3f = SDX3F; \
+    cpi->fn_ptr[BT].sdx8f = SDX8F; \
+    cpi->fn_ptr[BT].sdx4df = SDX4DF;
+
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+                                   int source_stride, \
+                                   const uint8_t *ref_ptr, \
+                                   int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+}
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) static unsigned int \
+fnname##_bits8(const uint8_t *src_ptr, \
+               int source_stride, \
+               const uint8_t *ref_ptr, \
+               int ref_stride, \
+               const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride, \
+                                    const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                second_pred) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride, \
+                                    const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                second_pred) >> 4; \
+}
+
+#define MAKE_BFP_SAD3_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t *ref_ptr, \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 3; i++) \
+    sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 3; i++) \
+    sad_array[i] >>= 4; \
+}
+
+#define MAKE_BFP_SAD8_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t *ref_ptr, \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 8; i++) \
+    sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 8; i++) \
+    sad_array[i] >>= 4; \
+}
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t* const ref_ptr[], \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t* const ref_ptr[], \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 4; i++) \
+    sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t* const ref_ptr[], \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 4; i++) \
+    sad_array[i] >>= 4; \
+}
+
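
Each wrapper family above normalizes a high-bit-depth SAD back to the 8-bit range: absolute differences grow 4x at 10 bits and 16x at 12 bits, so results shift right by 2 and 4 bits respectively, while the _bits8 variants pass through unchanged. What one generated pair amounts to, with a hypothetical base SAD over 16-bit samples:

#include <stdint.h>

/* Hypothetical full-precision SAD over a 4x4 block of 16-bit samples;
 * strides are in samples. */
static unsigned int sad4x4_16bit(const uint16_t *src, int src_stride,
                                 const uint16_t *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      const int d = src[c] - ref[c];
      sad += d < 0 ? -d : d;
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

/* The _bits10 / _bits12 shape: the same call with the result scaled
 * down to the 8-bit range so motion-search thresholds stay valid. */
static unsigned int sad4x4_bits10(const uint16_t *src, int src_stride,
                                  const uint16_t *ref, int ref_stride) {
  return sad4x4_16bit(src, src_stride, ref, ref_stride) >> 2;
}

static unsigned int sad4x4_bits12(const uint16_t *src, int src_stride,
                                  const uint16_t *ref, int ref_stride) {
  return sad4x4_16bit(src, src_stride, ref, ref_stride) >> 4;
}
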
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(vp9_high_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(vp9_high_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(vp9_high_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(vp9_high_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vp9_high_sad4x4x4d)
+
+static void highbd_set_var_fns(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (cm->use_highbitdepth) {
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vp9_high_sad32x16_bits8,
+                   vp9_high_sad32x16_avg_bits8,
+                   vp9_high_variance32x16,
+                   vp9_high_sub_pixel_variance32x16,
+                   vp9_high_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vp9_high_sad32x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vp9_high_sad16x32_bits8,
+                   vp9_high_sad16x32_avg_bits8,
+                   vp9_high_variance16x32,
+                   vp9_high_sub_pixel_variance16x32,
+                   vp9_high_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vp9_high_sad16x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vp9_high_sad64x32_bits8,
+                   vp9_high_sad64x32_avg_bits8,
+                   vp9_high_variance64x32,
+                   vp9_high_sub_pixel_variance64x32,
+                   vp9_high_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vp9_high_sad64x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vp9_high_sad32x64_bits8,
+                   vp9_high_sad32x64_avg_bits8,
+                   vp9_high_variance32x64,
+                   vp9_high_sub_pixel_variance32x64,
+                   vp9_high_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vp9_high_sad32x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vp9_high_sad32x32_bits8,
+                   vp9_high_sad32x32_avg_bits8,
+                   vp9_high_variance32x32,
+                   vp9_high_sub_pixel_variance32x32,
+                   vp9_high_sub_pixel_avg_variance32x32,
+                   vp9_high_sad32x32x3_bits8,
+                   vp9_high_sad32x32x8_bits8,
+                   vp9_high_sad32x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vp9_high_sad64x64_bits8,
+                   vp9_high_sad64x64_avg_bits8,
+                   vp9_high_variance64x64,
+                   vp9_high_sub_pixel_variance64x64,
+                   vp9_high_sub_pixel_avg_variance64x64,
+                   vp9_high_sad64x64x3_bits8,
+                   vp9_high_sad64x64x8_bits8,
+                   vp9_high_sad64x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vp9_high_sad16x16_bits8,
+                   vp9_high_sad16x16_avg_bits8,
+                   vp9_high_variance16x16,
+                   vp9_high_sub_pixel_variance16x16,
+                   vp9_high_sub_pixel_avg_variance16x16,
+                   vp9_high_sad16x16x3_bits8,
+                   vp9_high_sad16x16x8_bits8,
+                   vp9_high_sad16x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vp9_high_sad16x8_bits8,
+                   vp9_high_sad16x8_avg_bits8,
+                   vp9_high_variance16x8,
+                   vp9_high_sub_pixel_variance16x8,
+                   vp9_high_sub_pixel_avg_variance16x8,
+                   vp9_high_sad16x8x3_bits8,
+                   vp9_high_sad16x8x8_bits8,
+                   vp9_high_sad16x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vp9_high_sad8x16_bits8,
+                   vp9_high_sad8x16_avg_bits8,
+                   vp9_high_variance8x16,
+                   vp9_high_sub_pixel_variance8x16,
+                   vp9_high_sub_pixel_avg_variance8x16,
+                   vp9_high_sad8x16x3_bits8,
+                   vp9_high_sad8x16x8_bits8,
+                   vp9_high_sad8x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vp9_high_sad8x8_bits8,
+                   vp9_high_sad8x8_avg_bits8,
+                   vp9_high_variance8x8,
+                   vp9_high_sub_pixel_variance8x8,
+                   vp9_high_sub_pixel_avg_variance8x8,
+                   vp9_high_sad8x8x3_bits8,
+                   vp9_high_sad8x8x8_bits8,
+                   vp9_high_sad8x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vp9_high_sad8x4_bits8,
+                   vp9_high_sad8x4_avg_bits8,
+                   vp9_high_variance8x4,
+                   vp9_high_sub_pixel_variance8x4,
+                   vp9_high_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vp9_high_sad8x4x8_bits8,
+                   vp9_high_sad8x4x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vp9_high_sad4x8_bits8,
+                   vp9_high_sad4x8_avg_bits8,
+                   vp9_high_variance4x8,
+                   vp9_high_sub_pixel_variance4x8,
+                   vp9_high_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vp9_high_sad4x8x8_bits8,
+                   vp9_high_sad4x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vp9_high_sad4x4_bits8,
+                   vp9_high_sad4x4_avg_bits8,
+                   vp9_high_variance4x4,
+                   vp9_high_sub_pixel_variance4x4,
+                   vp9_high_sub_pixel_avg_variance4x4,
+                   vp9_high_sad4x4x3_bits8,
+                   vp9_high_sad4x4x8_bits8,
+                   vp9_high_sad4x4x4d_bits8)
+        break;
+
+      case VPX_BITS_10:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vp9_high_sad32x16_bits10,
+                   vp9_high_sad32x16_avg_bits10,
+                   vp9_high_10_variance32x16,
+                   vp9_high_10_sub_pixel_variance32x16,
+                   vp9_high_10_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vp9_high_sad32x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vp9_high_sad16x32_bits10,
+                   vp9_high_sad16x32_avg_bits10,
+                   vp9_high_10_variance16x32,
+                   vp9_high_10_sub_pixel_variance16x32,
+                   vp9_high_10_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vp9_high_sad16x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vp9_high_sad64x32_bits10,
+                   vp9_high_sad64x32_avg_bits10,
+                   vp9_high_10_variance64x32,
+                   vp9_high_10_sub_pixel_variance64x32,
+                   vp9_high_10_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vp9_high_sad64x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vp9_high_sad32x64_bits10,
+                   vp9_high_sad32x64_avg_bits10,
+                   vp9_high_10_variance32x64,
+                   vp9_high_10_sub_pixel_variance32x64,
+                   vp9_high_10_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vp9_high_sad32x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vp9_high_sad32x32_bits10,
+                   vp9_high_sad32x32_avg_bits10,
+                   vp9_high_10_variance32x32,
+                   vp9_high_10_sub_pixel_variance32x32,
+                   vp9_high_10_sub_pixel_avg_variance32x32,
+                   vp9_high_sad32x32x3_bits10,
+                   vp9_high_sad32x32x8_bits10,
+                   vp9_high_sad32x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vp9_high_sad64x64_bits10,
+                   vp9_high_sad64x64_avg_bits10,
+                   vp9_high_10_variance64x64,
+                   vp9_high_10_sub_pixel_variance64x64,
+                   vp9_high_10_sub_pixel_avg_variance64x64,
+                   vp9_high_sad64x64x3_bits10,
+                   vp9_high_sad64x64x8_bits10,
+                   vp9_high_sad64x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vp9_high_sad16x16_bits10,
+                   vp9_high_sad16x16_avg_bits10,
+                   vp9_high_10_variance16x16,
+                   vp9_high_10_sub_pixel_variance16x16,
+                   vp9_high_10_sub_pixel_avg_variance16x16,
+                   vp9_high_sad16x16x3_bits10,
+                   vp9_high_sad16x16x8_bits10,
+                   vp9_high_sad16x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vp9_high_sad16x8_bits10,
+                   vp9_high_sad16x8_avg_bits10,
+                   vp9_high_10_variance16x8,
+                   vp9_high_10_sub_pixel_variance16x8,
+                   vp9_high_10_sub_pixel_avg_variance16x8,
+                   vp9_high_sad16x8x3_bits10,
+                   vp9_high_sad16x8x8_bits10,
+                   vp9_high_sad16x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vp9_high_sad8x16_bits10,
+                   vp9_high_sad8x16_avg_bits10,
+                   vp9_high_10_variance8x16,
+                   vp9_high_10_sub_pixel_variance8x16,
+                   vp9_high_10_sub_pixel_avg_variance8x16,
+                   vp9_high_sad8x16x3_bits10,
+                   vp9_high_sad8x16x8_bits10,
+                   vp9_high_sad8x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vp9_high_sad8x8_bits10,
+                   vp9_high_sad8x8_avg_bits10,
+                   vp9_high_10_variance8x8,
+                   vp9_high_10_sub_pixel_variance8x8,
+                   vp9_high_10_sub_pixel_avg_variance8x8,
+                   vp9_high_sad8x8x3_bits10,
+                   vp9_high_sad8x8x8_bits10,
+                   vp9_high_sad8x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vp9_high_sad8x4_bits10,
+                   vp9_high_sad8x4_avg_bits10,
+                   vp9_high_10_variance8x4,
+                   vp9_high_10_sub_pixel_variance8x4,
+                   vp9_high_10_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vp9_high_sad8x4x8_bits10,
+                   vp9_high_sad8x4x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vp9_high_sad4x8_bits10,
+                   vp9_high_sad4x8_avg_bits10,
+                   vp9_high_10_variance4x8,
+                   vp9_high_10_sub_pixel_variance4x8,
+                   vp9_high_10_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vp9_high_sad4x8x8_bits10,
+                   vp9_high_sad4x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vp9_high_sad4x4_bits10,
+                   vp9_high_sad4x4_avg_bits10,
+                   vp9_high_10_variance4x4,
+                   vp9_high_10_sub_pixel_variance4x4,
+                   vp9_high_10_sub_pixel_avg_variance4x4,
+                   vp9_high_sad4x4x3_bits10,
+                   vp9_high_sad4x4x8_bits10,
+                   vp9_high_sad4x4x4d_bits10)
+        break;
+
+      case VPX_BITS_12:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vp9_high_sad32x16_bits12,
+                   vp9_high_sad32x16_avg_bits12,
+                   vp9_high_12_variance32x16,
+                   vp9_high_12_sub_pixel_variance32x16,
+                   vp9_high_12_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vp9_high_sad32x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vp9_high_sad16x32_bits12,
+                   vp9_high_sad16x32_avg_bits12,
+                   vp9_high_12_variance16x32,
+                   vp9_high_12_sub_pixel_variance16x32,
+                   vp9_high_12_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vp9_high_sad16x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vp9_high_sad64x32_bits12,
+                   vp9_high_sad64x32_avg_bits12,
+                   vp9_high_12_variance64x32,
+                   vp9_high_12_sub_pixel_variance64x32,
+                   vp9_high_12_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vp9_high_sad64x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vp9_high_sad32x64_bits12,
+                   vp9_high_sad32x64_avg_bits12,
+                   vp9_high_12_variance32x64,
+                   vp9_high_12_sub_pixel_variance32x64,
+                   vp9_high_12_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vp9_high_sad32x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vp9_high_sad32x32_bits12,
+                   vp9_high_sad32x32_avg_bits12,
+                   vp9_high_12_variance32x32,
+                   vp9_high_12_sub_pixel_variance32x32,
+                   vp9_high_12_sub_pixel_avg_variance32x32,
+                   vp9_high_sad32x32x3_bits12,
+                   vp9_high_sad32x32x8_bits12,
+                   vp9_high_sad32x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vp9_high_sad64x64_bits12,
+                   vp9_high_sad64x64_avg_bits12,
+                   vp9_high_12_variance64x64,
+                   vp9_high_12_sub_pixel_variance64x64,
+                   vp9_high_12_sub_pixel_avg_variance64x64,
+                   vp9_high_sad64x64x3_bits12,
+                   vp9_high_sad64x64x8_bits12,
+                   vp9_high_sad64x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vp9_high_sad16x16_bits12,
+                   vp9_high_sad16x16_avg_bits12,
+                   vp9_high_12_variance16x16,
+                   vp9_high_12_sub_pixel_variance16x16,
+                   vp9_high_12_sub_pixel_avg_variance16x16,
+                   vp9_high_sad16x16x3_bits12,
+                   vp9_high_sad16x16x8_bits12,
+                   vp9_high_sad16x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vp9_high_sad16x8_bits12,
+                   vp9_high_sad16x8_avg_bits12,
+                   vp9_high_12_variance16x8,
+                   vp9_high_12_sub_pixel_variance16x8,
+                   vp9_high_12_sub_pixel_avg_variance16x8,
+                   vp9_high_sad16x8x3_bits12,
+                   vp9_high_sad16x8x8_bits12,
+                   vp9_high_sad16x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vp9_high_sad8x16_bits12,
+                   vp9_high_sad8x16_avg_bits12,
+                   vp9_high_12_variance8x16,
+                   vp9_high_12_sub_pixel_variance8x16,
+                   vp9_high_12_sub_pixel_avg_variance8x16,
+                   vp9_high_sad8x16x3_bits12,
+                   vp9_high_sad8x16x8_bits12,
+                   vp9_high_sad8x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vp9_high_sad8x8_bits12,
+                   vp9_high_sad8x8_avg_bits12,
+                   vp9_high_12_variance8x8,
+                   vp9_high_12_sub_pixel_variance8x8,
+                   vp9_high_12_sub_pixel_avg_variance8x8,
+                   vp9_high_sad8x8x3_bits12,
+                   vp9_high_sad8x8x8_bits12,
+                   vp9_high_sad8x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vp9_high_sad8x4_bits12,
+                   vp9_high_sad8x4_avg_bits12,
+                   vp9_high_12_variance8x4,
+                   vp9_high_12_sub_pixel_variance8x4,
+                   vp9_high_12_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vp9_high_sad8x4x8_bits12,
+                   vp9_high_sad8x4x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vp9_high_sad4x8_bits12,
+                   vp9_high_sad4x8_avg_bits12,
+                   vp9_high_12_variance4x8,
+                   vp9_high_12_sub_pixel_variance4x8,
+                   vp9_high_12_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vp9_high_sad4x8x8_bits12,
+                   vp9_high_sad4x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vp9_high_sad4x4_bits12,
+                   vp9_high_sad4x4_avg_bits12,
+                   vp9_high_12_variance4x4,
+                   vp9_high_12_sub_pixel_variance4x4,
+                   vp9_high_12_sub_pixel_avg_variance4x4,
+                   vp9_high_sad4x4x3_bits12,
+                   vp9_high_sad4x4x8_bits12,
+                   vp9_high_sad4x4x4d_bits12)
+        break;
+
+      default:
+        assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                    "VPX_BITS_10 or VPX_BITS_12");
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -622,7 +1228,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   if (cpi->oxcf.use_highbitdepth) {
     cpi->mb.e_mbd.bd = (int)cm->bit_depth;
   }
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
 
@@ -693,6 +1299,10 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   cpi->ext_refresh_frame_flags_pending = 0;
   cpi->ext_refresh_frame_context_pending = 0;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
     vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
@@ -1066,6 +1676,10 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
       vp9_sub_pixel_avg_variance4x4,
       vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   /* vp9_init_quantizer() is first called here. Add check in
    * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
    * called later when needed. This will avoid unnecessary calls of
@@ -1193,6 +1807,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
 
 #endif
 }
+
 static int64_t get_sse(const uint8_t *a, int a_stride,
                        const uint8_t *b, int b_stride,
                        int width, int height) {
@@ -1234,6 +1849,63 @@ static int64_t get_sse(const uint8_t *a, int a_stride,
   return total_sse;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+                                    const uint8_t *b8, int b_stride,
+                                    int width, int height,
+                                    unsigned int input_shift) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int64_t total_sse = 0;
+  int x, y;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      int64_t diff;
+      diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+      total_sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+  return total_sse;
+}
+
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+                              const uint8_t *b, int b_stride,
+                              int width, int height) {
+  int64_t total_sse = 0;
+  int x, y;
+  const int dw = width % 16;
+  const int dh = height % 16;
+  unsigned int sse = 0;
+  int sum = 0;
+  if (dw > 0) {
+    high_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+                  dw, height, &sse, &sum);
+    total_sse += sse;
+  }
+  if (dh > 0) {
+    high_variance(&a[(height - dh) * a_stride], a_stride,
+                  &b[(height - dh) * b_stride], b_stride,
+                  width - dw, dh, &sse, &sum);
+    total_sse += sse;
+  }
+  for (y = 0; y < height / 16; ++y) {
+    const uint8_t *pa = a;
+    const uint8_t *pb = b;
+    for (x = 0; x < width / 16; ++x) {
+      vp9_high_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      total_sse += sse;
+      pa += 16;
+      pb += 16;
+    }
+    a += 16 * a_stride;
+    b += 16 * b_stride;
+  }
+  return total_sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
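
highbd_get_sse_shift handles the case where the coding depth exceeds the source depth (say, 12-bit coding of 10-bit input): both operands are shifted down by the difference before differencing, so distortion is measured at the original input precision. A reduced sketch with a worked check (hypothetical helper mirroring the inner loop):

#include <assert.h>
#include <stdint.h>

/* SSE measured after shifting both operands down to the source depth. */
static int64_t sse_at_input_depth(const uint16_t *a, const uint16_t *b,
                                  int n, unsigned int input_shift) {
  int64_t sse = 0;
  int i;
  for (i = 0; i < n; i++) {
    const int64_t diff = (a[i] >> input_shift) - (b[i] >> input_shift);
    sse += diff * diff;
  }
  return sse;
}

int main(void) {
  /* 12-bit samples carrying 10-bit content (values are multiples of 4). */
  const uint16_t a[2] = { 400, 800 };
  const uint16_t b[2] = { 404, 792 };
  /* Compared at 10 bits (shift of 2): diffs are 1 and 2, so SSE is 5. */
  assert(sse_at_input_depth(a, b, 2, 2) == 5);
  return 0;
}
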
+
 typedef struct {
   double psnr[4];       // total/y/u/v
   uint64_t sse[4];      // total/y/u/v
@@ -1273,11 +1945,69 @@ static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                                   (double)total_sse);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b,
+                             PSNR_STATS *psnr,
+                             unsigned int bit_depth,
+                             unsigned int in_bit_depth) {
+  const int widths[3] = {a->y_width,  a->uv_width,  a->uv_width };
+  const int heights[3] = {a->y_height, a->uv_height, a->uv_height};
+  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer,  a->v_buffer };
+  const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
+  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer,  b->v_buffer };
+  const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
+  const double peak = (double)((1 << in_bit_depth) - 1);
+  const unsigned int input_shift = bit_depth - in_bit_depth;
+
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    uint64_t sse;
+    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (input_shift) {
+        sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
+                                   b_planes[i], b_strides[i], w, h,
+                                   input_shift);
+      } else {
+        sse = highbd_get_sse(a_planes[i], a_strides[i],
+                             b_planes[i], b_strides[i], w, h);
+      }
+    } else {
+      sse = get_sse(a_planes[i], a_strides[i],
+                    b_planes[i], b_strides[i],
+                    w, h);
+    }
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+    total_sse += sse;
+    total_samples += samples;
+  }
+
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+                                  (double)total_sse);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
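calc_highbd_psnr reports PSNR at the input precision even when the codec runs at a deeper working depth: the peak comes from in_bit_depth, and highbd_get_sse_shift shifts each sample down by bit_depth - in_bit_depth before differencing. Assuming the usual definition implemented by vpx_sse_to_psnr (capped at a maximum PSNR), for N samples:

$$ \mathrm{PSNR} = 10 \log_{10}\!\left( \frac{N \cdot \mathrm{peak}^2}{\mathrm{SSE}} \right), \qquad \mathrm{peak} = 2^{\mathrm{in\_bit\_depth}} - 1 $$
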
 static void generate_psnr_packet(VP9_COMP *cpi) {
   struct vpx_codec_cx_pkt pkt;
   int i;
   PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
+                   cpi->mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+#else
   calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
+#endif
+
   for (i = 0; i < 4; ++i) {
     pkt.data.psnr.samples[i] = psnr.samples[i];
     pkt.data.psnr.sse[i] = psnr.sse[i];
@@ -1386,6 +2116,36 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
   uint8_t *src = s->y_buffer;
   int h = cm->height;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+    do {
+      fwrite(src16, s->y_width, 2,  yuv_rec_file);
+      src16 += s->y_stride;
+    } while (--h);
+
+    src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+    h = s->uv_height;
+
+    do {
+      fwrite(src16, s->uv_width, 2,  yuv_rec_file);
+      src16 += s->uv_stride;
+    } while (--h);
+
+    src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+    h = s->uv_height;
+
+    do {
+      fwrite(src16, s->uv_width, 2, yuv_rec_file);
+      src16 += s->uv_stride;
+    } while (--h);
+
+    fflush(yuv_rec_file);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   do {
     fwrite(src, s->y_width, 1,  yuv_rec_file);
     src += s->y_stride;
@@ -1411,8 +2171,14 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
 }
 #endif
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                                YV12_BUFFER_CONFIG *dst,
+                                                int bd) {
+#else
 static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
                                                 YV12_BUFFER_CONFIG *dst) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
   int i;
   const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
@@ -1428,15 +2194,31 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
   const int dst_heights[3] = {dst->y_crop_height, dst->uv_crop_height,
                               dst->uv_crop_height};
 
-  for (i = 0; i < MAX_MB_PLANE; ++i)
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i],
+                              src_strides[i], dsts[i], dst_heights[i],
+                              dst_widths[i], dst_strides[i], bd);
+    } else {
+      vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+                       dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+    }
+#else
     vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
                      dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
-
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
   vp9_extend_frame_borders(dst);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst, int bd) {
+#else
 static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                    YV12_BUFFER_CONFIG *dst) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   const int src_w = src->y_crop_width;
   const int src_h = src->y_crop_height;
   const int dst_w = dst->y_crop_width;
@@ -1460,10 +2242,24 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                      src_stride + (x / factor) * src_w / dst_w;
         uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+          vp9_high_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                             kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                             kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                             16 / factor, 16 / factor, bd);
+        } else {
+          vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                        kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                        kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                        16 / factor, 16 / factor);
+        }
+#else
         vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
                       kernel[x_q4 & 0xf], 16 * src_w / dst_w,
                       kernel[y_q4 & 0xf], 16 * src_h / dst_h,
                       16 / factor, 16 / factor);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
   }
@@ -1632,9 +2428,14 @@ void vp9_scale_references(VP9_COMP *cpi) {
                                cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
                                cm->use_highbitdepth,
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
                                VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+#if CONFIG_VP9_HIGHBITDEPTH
+      scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf,
+                             (int)cm->bit_depth);
+#else
       scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
     } else {
       cpi->scaled_ref_idx[ref_frame - 1] = idx;
@@ -1698,11 +2499,12 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
         cpi->rc.total_target_vs_actual,
         (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
         cpi->rc.total_actual_bits, cm->base_qindex,
-        vp9_convert_qindex_to_q(cm->base_qindex),
-        (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
-        vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality),
+        vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
+        (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+        vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality,
+                                cm->bit_depth),
         cpi->rc.avg_q,
-        vp9_convert_qindex_to_q(cpi->oxcf.cq_level),
+        vp9_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
         cpi->refresh_last_frame, cpi->refresh_golden_frame,
         cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
         cpi->twopass.bits_left,
@@ -1824,11 +2626,22 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
            rc->this_key_frame_forced &&
            (rc->projected_frame_size < rc->max_frame_bandwidth)) {
         int last_q = q;
-        int kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        int kf_err;
 
         int high_err_target = cpi->ambient_err;
         int low_err_target = cpi->ambient_err >> 1;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm),
+                                        cm->bit_depth);
+        } else {
+          kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        }
+#else
+        kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
         // Prevent possible divide by zero error below for perfect KF
         kf_err += !kf_err;
 
@@ -1999,7 +2812,11 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *scaled) {
   if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
       cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+#else
     scale_and_extend_frame_nonnormative(unscaled, scaled);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     return scaled;
   } else {
     return unscaled;
@@ -2289,7 +3106,17 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   // fixed interval. Note the reconstruction error if it is the frame before
   // the force key frame
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
+                                              get_frame_new_buffer(cm),
+                                              cm->bit_depth);
+    } else {
+      cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    }
+#else
     cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   // If the encoder forced a KEY_FRAME decision
@@ -2415,13 +3242,19 @@ static void init_motion_estimation(VP9_COMP *cpi) {
   }
 }
 
-static void check_initial_width(VP9_COMP *cpi, int subsampling_x,
-                                int subsampling_y) {
+static void check_initial_width(VP9_COMP *cpi,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                int use_highbitdepth,
+#endif
+                                int subsampling_x, int subsampling_y) {
   VP9_COMMON *const cm = &cpi->common;
 
   if (!cpi->initial_width) {
     cm->subsampling_x = subsampling_x;
     cm->subsampling_y = subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+    cm->use_highbitdepth = use_highbitdepth;
+#endif
 
     alloc_raw_frame_buffers(cpi);
     alloc_ref_frame_buffers(cpi);
@@ -2443,8 +3276,12 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
   int res = 0;
   const int subsampling_x = sd->uv_width  < sd->y_width;
   const int subsampling_y = sd->uv_height < sd->y_height;
-
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int use_highbitdepth = sd->flags & YV12_FLAG_HIGHBITDEPTH;
+  check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#else
   check_initial_width(cpi, subsampling_x, subsampling_y);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   vpx_usec_timer_start(&timer);
 
@@ -2760,7 +3597,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     vp9_setup_scale_factors_for_frame(&ref_buf->sf,
                                       buf->y_crop_width, buf->y_crop_height,
                                       cm->width, cm->height);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     if (vp9_is_scaled(&ref_buf->sf))
       vp9_extend_frame_borders(buf);
   }
@@ -2783,7 +3620,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                                        vp9_high_idct4x4_add;
 #else
     cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
     vp9_first_pass(cpi, source);
   } else if (oxcf->pass == 2 &&
@@ -2836,7 +3673,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
         YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
         YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
         PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+        calc_highbd_psnr(orig, recon, &psnr, cpi->mb.e_mbd.bd,
+                         cpi->oxcf.input_bit_depth);
+#else
         calc_psnr(orig, recon, &psnr);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         cpi->total += psnr.psnr[0];
         cpi->total_y += psnr.psnr[1];
@@ -2856,7 +3698,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 #endif
           vp9_clear_system_state();
 
+#if CONFIG_VP9_HIGHBITDEPTH
+          calc_highbd_psnr(orig, pp, &psnr2, cpi->mb.e_mbd.bd,
+                           cpi->oxcf.input_bit_depth);
+#else
           calc_psnr(orig, pp, &psnr2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->totalp += psnr2.psnr[0];
           cpi->totalp_y += psnr2.psnr[1];
@@ -2865,12 +3712,32 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
           cpi->totalp_sq_error += psnr2.sse[0];
           cpi->totalp_samples += psnr2.samples[0];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cm->use_highbitdepth) {
+            frame_ssim2 = vp9_highbd_calc_ssim(
+                orig, recon, &weight, xd->bd,
+                xd->bd - cpi->oxcf.input_bit_depth);
+          } else {
+            frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
+          }
+#else
           frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->summed_quality += frame_ssim2 * weight;
           cpi->summed_weights += weight;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cm->use_highbitdepth) {
+            frame_ssim2 = vp9_highbd_calc_ssim(
+                orig, &cm->post_proc_buffer, &weight,
+                xd->bd, xd->bd - cpi->oxcf.input_bit_depth);
+          } else {
+            frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+          }
+#else
           frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->summedp_quality += frame_ssim2 * weight;
           cpi->summedp_weights += weight;
@@ -2889,7 +3756,18 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
       if (cpi->b_calculate_ssimg) {
         double y, u, v, frame_all;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          frame_all = vp9_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
+                                            &u, &v, xd->bd,
+                                            xd->bd - cpi->oxcf.input_bit_depth);
+        } else {
+          frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
+                                     &v);
+        }
+#else
         frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         cpi->total_ssimg_y += y;
         cpi->total_ssimg_u += u;
         cpi->total_ssimg_v += v;
@@ -2986,8 +3864,11 @@ int vp9_set_internal_size(VP9_COMP *cpi,
 int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
                          unsigned int height) {
   VP9_COMMON *cm = &cpi->common;
-
+#if CONFIG_VP9_HIGHBITDEPTH
+  check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+#else
   check_initial_width(cpi, 1, 1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   if (width) {
     cm->width = width;
@@ -3033,6 +3914,35 @@ int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) {
                       a->y_crop_width, a->y_crop_height);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                         const YV12_BUFFER_CONFIG *b,
+                         vpx_bit_depth_t bit_depth) {
+  unsigned int sse;
+  int sum;
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      high_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                    a->y_crop_width, a->y_crop_height, &sse, &sum);
+      return (int) sse;
+    case VPX_BITS_10:
+      high_10_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                       a->y_crop_width, a->y_crop_height, &sse, &sum);
+      return (int) sse;
+    case VPX_BITS_12:
+      high_12_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                       a->y_crop_width, a->y_crop_height, &sse, &sum);
+      return (int) sse;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 int vp9_get_quantizer(VP9_COMP *cpi) {
   return cpi->common.base_qindex;
index 80774de9247ce86e2e7fc29de31e49cfa566d8be..9bd16bc2753395a7ed11bae46b4733079c43c3ca 100644 (file)
@@ -483,6 +483,11 @@ static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
 }
 
 int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                         const YV12_BUFFER_CONFIG *b,
+                         vpx_bit_depth_t bit_depth);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_alloc_compressor_data(VP9_COMP *cpi);
 
index e8517c8892528dac2ea47ab32269f76e5eb18b89..5b01bc9f2aa53e1d42b5832a3789322c9037d22f 100644 (file)
@@ -55,6 +55,52 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+                                         uint8_t *dst8, int dst_pitch,
+                                         int w, int h,
+                                         int extend_top, int extend_left,
+                                         int extend_bottom, int extend_right) {
+  int i, linesize;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+  // copy the left and right most columns out
+  const uint16_t *src_ptr1 = src;
+  const uint16_t *src_ptr2 = src + w - 1;
+  uint16_t *dst_ptr1 = dst - extend_left;
+  uint16_t *dst_ptr2 = dst + w;
+
+  for (i = 0; i < h; i++) {
+    vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
+    vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(uint16_t));
+    vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_pitch;
+    src_ptr2 += src_pitch;
+    dst_ptr1 += dst_pitch;
+    dst_ptr2 += dst_pitch;
+  }
+
+  // Now copy the top and bottom lines into each line of the respective
+  // borders
+  src_ptr1 = dst - extend_left;
+  src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+  dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * (h) - extend_left;
+  linesize = extend_left + extend_right + w;
+
+  for (i = 0; i < extend_top; i++) {
+    vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+    dst_ptr1 += dst_pitch;
+  }
+
+  for (i = 0; i < extend_bottom; i++) {
+    vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+    dst_ptr2 += dst_pitch;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                YV12_BUFFER_CONFIG *dst) {
   // Extend src frame in buffer
@@ -75,6 +121,26 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
   const int eb_uv = eb_y >> uv_height_subsampling;
   const int er_uv = er_y >> uv_width_subsampling;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+    highbd_copy_and_extend_plane(src->y_buffer, src->y_stride,
+                                 dst->y_buffer, dst->y_stride,
+                                 src->y_width, src->y_height,
+                                 et_y, el_y, eb_y, er_y);
+
+    highbd_copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                                 dst->u_buffer, dst->uv_stride,
+                                 src->uv_width, src->uv_height,
+                                 et_uv, el_uv, eb_uv, er_uv);
+
+    highbd_copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                                 dst->v_buffer, dst->uv_stride,
+                                 src->uv_width, src->uv_height,
+                                 et_uv, el_uv, eb_uv, er_uv);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   copy_and_extend_plane(src->y_buffer, src->y_stride,
                         dst->y_buffer, dst->y_stride,
                         src->y_width, src->y_height,
index 0282e9f9a358b0a07bbf7db486817e70b77a5431..9b1fa650812489604850952544c92bdfae3afc9c 100644 (file)
@@ -281,6 +281,60 @@ static unsigned int get_prediction_error(BLOCK_SIZE bsize,
   return sse;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+                                                      int bd) {
+  switch (bd) {
+    default:
+      switch (bsize) {
+        case BLOCK_8X8:
+          return vp9_high_mse8x8;
+        case BLOCK_16X8:
+          return vp9_high_mse16x8;
+        case BLOCK_8X16:
+          return vp9_high_mse8x16;
+        default:
+          return vp9_high_mse16x16;
+      }
+      break;
+    case 10:
+      switch (bsize) {
+        case BLOCK_8X8:
+          return vp9_high_10_mse8x8;
+        case BLOCK_16X8:
+          return vp9_high_10_mse16x8;
+        case BLOCK_8X16:
+          return vp9_high_10_mse8x16;
+        default:
+          return vp9_high_10_mse16x16;
+      }
+      break;
+    case 12:
+      switch (bsize) {
+        case BLOCK_8X8:
+          return vp9_high_12_mse8x8;
+        case BLOCK_16X8:
+          return vp9_high_12_mse16x8;
+        case BLOCK_8X16:
+          return vp9_high_12_mse8x16;
+        default:
+          return vp9_high_12_mse16x16;
+      }
+      break;
+  }
+}
+
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+                                                const struct buf_2d *src,
+                                                const struct buf_2d *ref,
+                                                int bd) {
+  unsigned int sse;
+  const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+  fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+  return sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // Refine the motion search range according to the frame dimension
 // for first pass test.
 static int get_search_range(const VP9_COMMON *cm) {
@@ -311,6 +365,11 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
 
   // Override the default variance function to use MSE.
   v_fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Center the initial step/diamond search on best mv.
   tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
@@ -562,6 +621,24 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
          (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
       vp9_encode_intra_block_plane(x, bsize, 0);
       this_error = vp9_get_mb_ss(x->plane[0].src_diff);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        switch (cm->bit_depth) {
+          case VPX_BITS_8:
+            break;
+          case VPX_BITS_10:
+            this_error >>= 4;
+            break;
+          case VPX_BITS_12:
+            this_error >>= 8;
+            break;
+          default:
+            assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                        "VPX_BITS_10 or VPX_BITS_12");
+            return;
+        }
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
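The shifts normalize an SSE-style intra error back to the 8-bit scale: each extra bit of depth doubles residual magnitudes, so squared errors grow by two bits per bit of depth,

$$ \mathrm{err}_8 = \mathrm{err}_{bd} \gg 2(bd - 8) $$

which gives >> 4 at 10 bits and >> 8 at 12 bits.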
 
       if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
         vp9_clear_system_state();
@@ -601,8 +678,18 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
         struct buf_2d unscaled_last_source_buf_2d;
 
         xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-        motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                            &xd->plane[0].pre[0]);
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+        } else {
+          motion_error = get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+        }
+#else
+        motion_error = get_prediction_error(
+            bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // Compute the motion error of the 0,0 motion using the last source
         // frame as the reference. Skip the further motion search on
@@ -611,8 +698,18 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
             cpi->unscaled_last_source->y_buffer + recon_yoffset;
         unscaled_last_source_buf_2d.stride =
             cpi->unscaled_last_source->y_stride;
-        raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                &unscaled_last_source_buf_2d);
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          raw_motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+        } else {
+          raw_motion_error = get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+        }
+#else
+        raw_motion_error = get_prediction_error(
+            bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // TODO(pengchong): Replace the hard-coded threshold
         if (raw_motion_error > 25 || lc != NULL) {
@@ -648,8 +745,18 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
             int gf_motion_error;
 
             xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-            gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                   &xd->plane[0].pre[0]);
+#if CONFIG_VP9_HIGHBITDEPTH
+            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+              gf_motion_error = highbd_get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+            } else {
+              gf_motion_error = get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+            }
+#else
+            gf_motion_error = get_prediction_error(
+                bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
             first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
                                      &gf_motion_error);
index a25dc61aae3d113339125710cabedcc9cb97789b..258b45994f29806477280c5301bbb1ee18af01aa 100644 (file)
@@ -284,18 +284,73 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
   int tc = bc;                                                             \
                                                                            \
   bestmv->row *= 8;                                                        \
-  bestmv->col *= 8;                                                        \
-  if (second_pred != NULL) {                                               \
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                \
-    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
-    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                  \
-  } else {                                                                 \
-    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);          \
-  }                                                                        \
-  *distortion = besterr;                                                   \
+  bestmv->col *= 8;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define SETUP_CENTER_ERROR                                                   \
+  if (second_pred != NULL) {                                                 \
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {                       \
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64);             \
+      vp9_high_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,     \
+                             y_stride);                                      \
+      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride,   \
+                        sse1);                                               \
+    } else {                                                                 \
+      DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                \
+      vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
+      besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                  \
+    }                                                                        \
+  } else {                                                                   \
+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);            \
+  }                                                                          \
+  *distortion = besterr;                                                     \
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
 
-int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+#else
+
+#define SETUP_CENTER_ERROR                                                   \
+  if (second_pred != NULL) {                                                 \
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                  \
+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);   \
+    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                    \
+  } else {                                                                   \
+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);            \
+  }                                                                          \
+  *distortion = besterr;                                                     \
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE int divide_and_round(const int n, const int d) {
+  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
+
+static INLINE int is_sad_list_wellbehaved(int *sad_list) {
+  return sad_list[0] < sad_list[1] &&
+         sad_list[0] < sad_list[2] &&
+         sad_list[0] < sad_list[3] &&
+         sad_list[0] < sad_list[4];
+}
+
+// Returns an estimate of the cost-surface minimum at the given precision,
+// in units of 1/2^bits pel.
+// Assumes a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C.
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minima (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static void get_cost_surf_min(int *sad_list, int *ir, int *ic,
+                              int bits) {
+  *ic = divide_and_round((sad_list[1] - sad_list[3]) * (1 << (bits - 1)),
+                         (sad_list[1] - 2 * sad_list[0] + sad_list[3]));
+  *ir = divide_and_round((sad_list[4] - sad_list[2]) * (1 << (bits - 1)),
+                         (sad_list[4] - 2 * sad_list[0] + sad_list[2]));
+}
+
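The integerization in get_cost_surf_min follows from fitting the 1-D slice of the model. With samples S_1 = S(-1), S_0 = S(0), S_3 = S(+1) of S(x) = A(x - x_0)^2 + C:

$$ S_1 - S_3 = 4 A x_0, \qquad S_1 + S_3 - 2 S_0 = 2 A \;\Rightarrow\; x_0 = \frac{S_1 - S_3}{2\,(S_1 + S_3 - 2 S_0)} $$

Multiplying the numerator by 2^{bits-1} and applying divide_and_round (round half away from zero) yields x_0 in units of 1/2^bits pel; the y axis is identical with S_4 and S_2. The is_sad_list_wellbehaved gate restricts the fit to the convex case, where the center sample is the strict minimum and the denominators are positive.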
+int vp9_find_best_sub_pixel_surface_fit(const MACROBLOCK *x,
                                         MV *bestmv, const MV *ref_mv,
                                         int allow_hp,
                                         int error_per_bit,
@@ -309,7 +364,127 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
                                         const uint8_t *second_pred,
                                         int w, int h) {
   SETUP_SUBPEL_SEARCH;
+  SETUP_CENTER_ERROR;
+  (void) halfiters;
+  (void) quarteriters;
+  (void) eighthiters;
+  (void) whichdir;
+  (void) allow_hp;
+  (void) forced_stop;
+  (void) hstep;
 
+  if (sad_list &&
+      sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
+      sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
+      sad_list[4] != INT_MAX &&
+      is_sad_list_wellbehaved(sad_list)) {
+    int ir, ic;
+    unsigned int minpt;
+    get_cost_surf_min(sad_list, &ir, &ic, 3);
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + ir, tc + ic);
+    }
+  }
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
+                                             MV *bestmv, const MV *ref_mv,
+                                             int allow_hp,
+                                             int error_per_bit,
+                                             const vp9_variance_fn_ptr_t *vfp,
+                                             int forced_stop,
+                                             int iters_per_step,
+                                             int *sad_list,
+                                             int *mvjcost, int *mvcost[2],
+                                             int *distortion,
+                                             unsigned int *sse1,
+                                             const uint8_t *second_pred,
+                                             int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  SETUP_CENTER_ERROR;
+  if (sad_list &&
+      sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
+      sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
+      sad_list[4] != INT_MAX &&
+      is_sad_list_wellbehaved(sad_list)) {
+    unsigned int minpt;
+    int ir, ic;
+    get_cost_surf_min(sad_list, &ir, &ic, 1);
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  tr = br;
+  tc = bc;
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (two if the diagonal candidate was selected).
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+                                        MV *bestmv, const MV *ref_mv,
+                                        int allow_hp,
+                                        int error_per_bit,
+                                        const vp9_variance_fn_ptr_t *vfp,
+                                        int forced_stop,
+                                        int iters_per_step,
+                                        int *sad_list,
+                                        int *mvjcost, int *mvcost[2],
+                                        int *distortion,
+                                        unsigned int *sse1,
+                                        const uint8_t *second_pred,
+                                        int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  SETUP_CENTER_ERROR;
   if (sad_list &&
       sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
       sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
@@ -401,6 +576,7 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  const uint8_t *second_pred,
                                  int w, int h) {
   SETUP_SUBPEL_SEARCH;
+  SETUP_CENTER_ERROR;
   (void) sad_list;  // to silence compiler warning
 
   // Each subsequent iteration checks at least one point in
@@ -484,6 +660,52 @@ static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) {
 #define MAX_PATTERN_CANDIDATES      8  // max number of candidates per scale
 #define PATTERN_CANDIDATES_REF      3  // number of refinement candidates
 
+// Calculate and return a sad+mvcost list around an integer best pel.
+static INLINE void calc_int_sad_cost_list(MACROBLOCK *x,
+                                          const MV *ref_mv,
+                                          int sadpb,
+                                          const vp9_variance_fn_ptr_t *fn_ptr,
+                                          const MV *best_mv,
+                                          int *cost_list) {
+  static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int br = best_mv->row;
+  int bc = best_mv->col;
+  MV this_mv;
+  int i;
+
+  this_mv.row = br;
+  this_mv.col = bc;
+  cost_list[0] = fn_ptr->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride) +
+      mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+  if (check_bounds(x, br, bc, 1)) {
+    for (i = 0; i < 4; i++) {
+      const MV this_mv = {br + neighbors[i].row,
+        bc + neighbors[i].col};
+      cost_list[i + 1] = fn_ptr->sdf(what->buf, what->stride,
+                                     get_buf_from_mv(in_what, &this_mv),
+                                     in_what->stride) +
+          mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+    }
+  } else {
+    for (i = 0; i < 4; i++) {
+      const MV this_mv = {br + neighbors[i].row,
+        bc + neighbors[i].col};
+      if (!is_mv_in(x, &this_mv))
+        cost_list[i + 1] = INT_MAX;
+      else
+        cost_list[i + 1] = fn_ptr->sdf(what->buf, what->stride,
+                                       get_buf_from_mv(in_what, &this_mv),
+                                       in_what->stride) +
+            mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+    }
+  }
+}
+
 // Generic pattern search function that searches over multiple scales.
 // Each scale can have a different number of candidates and shape of
 // candidates as indicated in the num_candidates and candidates arrays
@@ -1378,10 +1600,10 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
-
 int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
+                           int *cost_list,
                            const vp9_variance_fn_ptr_t *fn_ptr,
                            const MV *ref_mv, MV *dst_mv) {
   MV temp_mv;
@@ -1434,6 +1656,11 @@ int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
       *dst_mv = best_mv;
     }
   }
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_sad_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
   return bestsme;
 }
 
@@ -1792,7 +2019,7 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
     case NSTEP:
       var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
                                    MAX_MVSEARCH_STEPS - 1 - step_param,
-                                   1, fn_ptr, ref_mv, tmp_mv);
+                                   1, sad_list, fn_ptr, ref_mv, tmp_mv);
       break;
     default:
       assert(!"Invalid search method.");
index 9b4734a6f0bc73843b676e1fa2a15e6ddb1325ee..3156cb21ed17bdea01bbff3904a62c72d2ed19cd 100644 (file)
@@ -70,6 +70,7 @@ int vp9_init_search_range(int size);
 int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
+                           int *cost_list,
                            const vp9_variance_fn_ptr_t *fn_ptr,
                            const MV *ref_mv, MV *dst_mv);
 
@@ -107,6 +108,8 @@ typedef int (fractional_mv_step_fp) (
 
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_surface_fit;
 
 typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     const MV *ref_mv, int sad_per_bit,
index 2fc05e7fe2041c202a49dbaace16a52c1cd98207..85984fd7ef6deea76f13698f959c40c5fea791fa 100644 (file)
@@ -40,7 +40,15 @@ static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
 
   vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1,
                         partial_frame);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show, cm->bit_depth);
+  } else {
+    filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+  }
+#else
   filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Re-instate the unfiltered frame
   vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
@@ -145,7 +153,26 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
     const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
     // These values were determined by linear fitting the result of the
     // searched level, filt_guess = q * 0.316206 + 3.87252
+#if CONFIG_VP9_HIGHBITDEPTH
+    int filt_guess;
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+        break;
+      case VPX_BITS_10:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+        break;
+      case VPX_BITS_12:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+        break;
+      default:
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
+                    "or VPX_BITS_12");
+        return;
+    }
+#else
     int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     if (cm->frame_type == KEY_FRAME)
       filt_guess -= 4;
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
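Up to rounding of the original fit, the three constant pairs are one line rescaled per bit depth: q roughly quadruples for every two extra bits, so the intercept grows by 4x and 16x while the shift grows by 2 and 4 (note 4060632 = 4 * 1015158 and 16242526 is approximately 16 * 1015158), keeping filt_guess on the 8-bit filter-level scale:

$$ \mathrm{filt\_guess} = \mathrm{round}\!\left( \frac{20723\, q + c_b}{2^{s_b}} \right), \qquad (c_b, s_b) \in \{ (1015158, 18),\; (4060632, 20),\; (16242526, 22) \} $$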
index a788c1d8e3fd5ac48ab6588be7553bc7c8834035..428767a44182af5b3ec456b67f380b40e0f42354 100644 (file)
@@ -241,13 +241,44 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+                                 dc_quant >> (xd->bd - 5), &rate, &dist);
+  } else {
+    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+                                 dc_quant >> 3, &rate, &dist);
+  }
+#else
   vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
                                dc_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   *out_rate_sum = rate >> 1;
   *out_dist_sum = dist << 3;
 
-  vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
-                               ac_quant >> 3, &rate, &dist);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_model_rd_from_var_lapndz(var,
+                                 1 << num_pels_log2_lookup[bsize],
+                                 ac_quant >> (xd->bd - 5),
+                                 &rate,
+                                 &dist);
+  } else {
+    vp9_model_rd_from_var_lapndz(var,
+                                 1 << num_pels_log2_lookup[bsize],
+                                 ac_quant >> 3,
+                                 &rate,
+                                 &dist);
+  }
+#else
+  vp9_model_rd_from_var_lapndz(var,
+                               1 << num_pels_log2_lookup[bsize],
+                               ac_quant >> 3,
+                               &rate,
+                               &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   *out_rate_sum += rate;
   *out_dist_sum += dist << 4;
 }
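Both branches hand vp9_model_rd_from_var_lapndz a quantizer on the 8-bit scale: dequant values grow by 2^(bd-8) at higher depths, so the 8-bit >> 3 becomes

$$ \mathrm{dequant} \gg (3 + (bd - 8)) = \mathrm{dequant} \gg (bd - 5) $$

i.e. >> 5 at 10 bits and >> 7 at 12 bits; the same shift appears in model_rd_for_sb in vp9_rdopt.c below.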
@@ -293,9 +324,17 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
     // The encode_breakout input
     const unsigned int min_thresh =
         MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int shift = 2 * xd->bd - 16;
+#endif
 
     // Calculate threshold according to dequant value.
     thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+      thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
 
     // Adjust ac threshold according to partition size.
@@ -303,6 +342,11 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
         8 - (b_width_log2(bsize) + b_height_log2(bsize));
 
     thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+      thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   } else {
     thresh_ac = 0;
     thresh_dc = 0;
@@ -438,9 +482,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
   unsigned int sse_y = UINT_MAX;
-
-  const int intra_cost_penalty =
-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                            intra_cost_penalty, 0);
   const int intra_mode_cost = 50;
@@ -461,14 +504,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
   PRED_BUFFER tmp[4];
   DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, pred_buf_16, 3 * 64 * 64);
+#endif
   struct buf_2d orig_dst = pd->dst;
   PRED_BUFFER *best_pred = NULL;
   PRED_BUFFER *this_mode_pred = NULL;
+  const int pixels_in_block = bh * bw;
 
   if (cpi->sf.reuse_inter_pred_sby) {
     int i;
     for (i = 0; i < 3; i++) {
-      tmp[i].data = &pred_buf[bw * bh * i];
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth)
+        tmp[i].data = CONVERT_TO_BYTEPTR(&pred_buf_16[pixels_in_block * i]);
+      else
+        tmp[i].data = &pred_buf[pixels_in_block * i];
+#else
+      tmp[i].data = &pred_buf[pixels_in_block * i];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       tmp[i].stride = bw;
       tmp[i].in_use = 0;
     }
@@ -703,8 +757,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby &&
       best_pred->data != orig_dst.buf) {
     pd->dst = orig_dst;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      vp9_high_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,
+                             NULL, 0, NULL, 0, bw, bh, xd->bd);
+    } else {
+      vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,
+                        NULL, 0, NULL, 0, bw, bh);
+    }
+#else
     vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, NULL, 0,
                       NULL, 0, bw, bh);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   mbmi->mode          = best_mode;
index 17369d4c7396d644c9b3572a3e150a4c78562158..a32776ac7d5e0e48088235519a17b0b3af6730ac 100644 (file)
@@ -155,7 +155,7 @@ int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
   }
 #else
   int rdmult = 88 * q * q / 24;
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
@@ -187,7 +187,7 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
 #else
   (void) bit_depth;
   q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   // TODO(debargha): Adjust the function below.
   return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 }
@@ -213,7 +213,7 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
 #else
   cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
   cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
@@ -598,3 +598,24 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
     if (sf->disable_split_mask & (1 << i))
       rd->thresh_mult_sub8x8[i] = INT_MAX;
 }
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth) {
+  const int q = vp9_dc_quant(qindex, qdelta, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return 20 * q;
+    case VPX_BITS_10:
+      return 5 * q;
+    case VPX_BITS_12:
+      return ROUND_POWER_OF_TWO(5 * q, 2);
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  return 20 * q;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
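The per-depth multipliers keep the penalty constant when expressed in 8-bit quantizer units: vp9_dc_quant roughly quadruples for every two extra bits, so with q_10 approximately 4 q_8 and q_12 approximately 16 q_8,

$$ 20\, q_8 \approx 5\, q_{10} \approx \tfrac{5}{4}\, q_{12} $$

which is why the 12-bit case returns ROUND_POWER_OF_TWO(5 * q, 2).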
index 5dcb2f8d75a29739783dd5d05fb655c56b6992ad..214f6965b727a0cd4a253efc70542bf7160f0218 100644 (file)
@@ -162,6 +162,10 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd,
                           int mi_row, int mi_col,
                           const struct scale_factors *scale,
                           const struct scale_factors *scale_uv);
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
index 9413d46b9c10ede6821dbc754ce159f21d27896f..bf0de4b29e53f1fc483994ac64ff9aaa27c5df22 100644 (file)
@@ -228,9 +228,13 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
     // Fast approximate the modelling function.
     if (cpi->oxcf.speed > 4) {
       int64_t rate;
-      int64_t dist;
       int64_t square_error = sse;
       int quantizer = (pd->dequant[1] >> 3);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        quantizer >>= (xd->bd - 8);
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
       if (quantizer < 120)
         rate = (square_error * (280 - quantizer)) >> 8;
@@ -240,8 +244,19 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
       rate_sum += rate;
       dist_sum += dist;
     } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+                                     pd->dequant[1] >> (xd->bd - 5),
+                                     &rate, &dist);
+      } else {
+        vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+                                     pd->dequant[1] >> 3, &rate, &dist);
+      }
+#else
       vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
                                    pd->dequant[1] >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       rate_sum += rate;
       dist_sum += dist;
     }
@@ -266,6 +281,31 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
   return error;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_high_block_error_c(const tran_low_t *coeff,
+                               const tran_low_t *dqcoeff,
+                               intptr_t block_size,
+                               int64_t *ssz, int bd) {
+  int i;
+  int64_t error = 0, sqcoeff = 0;
+  int shift = 2 * (bd - 8);
+  int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i++) {
+    const int64_t diff = coeff[i] - dqcoeff[i];
+    error +=  diff * diff;
+    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
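vp9_high_block_error_c accumulates at full precision and rescales once at the end: coefficients at depth bd carry bd - 8 extra bits, so the squared terms are 2(bd - 8) bits larger, and both error and *ssz are rounded to nearest before the shift back to the 8-bit scale the RD code expects:

$$ \mathrm{err}_8 = \left( \mathrm{err}_{bd} + 2^{2(bd-8)-1} \right) \gg 2(bd - 8) $$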
 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
  * decide whether to include cost of a trailing EOB node or not (i.e. we
  * can skip this if the last coefficient in this transform block, e.g. the
@@ -351,8 +391,14 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
 
   return cost;
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void dist_block(int plane, int block, TX_SIZE tx_size,
+                       struct rdcost_block_args* args, int bd) {
+#else
 static void dist_block(int plane, int block, TX_SIZE tx_size,
                        struct rdcost_block_args* args) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   const int ss_txfrm_size = tx_size << 1;
   MACROBLOCK* const x = args->x;
   MACROBLOCKD* const xd = &x->e_mbd;
@@ -362,14 +408,24 @@ static void dist_block(int plane, int block, TX_SIZE tx_size,
   int shift = tx_size == TX_32X32 ? 0 : 2;
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_VP9_HIGHBITDEPTH
+  args->dist = vp9_high_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                    &this_sse, bd) >> shift;
+#else
   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
                                &this_sse) >> shift;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   args->sse  = this_sse >> shift;
 
   if (x->skip_encode && !is_inter_block(&xd->mi[0].src_mi->mbmi)) {
     // TODO(jingning): tune the model to better capture the distortion.
     int64_t p = (pd->dequant[1] * pd->dequant[1] *
                     (1 << ss_txfrm_size)) >> (shift + 2);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      p >>= ((xd->bd - 8) * 2);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     args->dist += (p >> 4);
     args->sse  += p;
   }
@@ -399,12 +455,28 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
 
   if (!is_inter_block(mbmi)) {
     vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      dist_block(plane, block, tx_size, args, xd->bd);
+    } else {
+      dist_block(plane, block, tx_size, args, 8);
+    }
+#else
     dist_block(plane, block, tx_size, args);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   } else if (max_txsize_lookup[plane_bsize] == tx_size) {
     if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
       // full forward transform and quantization
       vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dist_block(plane, block, tx_size, args, xd->bd);
+      } else {
+        dist_block(plane, block, tx_size, args, 8);
+      }
+#else
       dist_block(plane, block, tx_size, args);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
       // compute DC coefficient
       tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
@@ -424,7 +496,15 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
   } else {
     // full forward transform and quantization
     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      dist_block(plane, block, tx_size, args, xd->bd);
+    } else {
+      dist_block(plane, block, tx_size, args, 8);
+    }
+#else
     dist_block(plane, block, tx_size, args);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   rate_block(plane, block, plane_bsize, tx_size, args);
@@ -659,6 +739,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy;
   uint8_t best_dst[8 * 8];
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint16_t best_dst16[8 * 8];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   assert(ib < 4);
 
@@ -666,6 +749,108 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   vpx_memcpy(tl, l, sizeof(tl));
   xd->mi[0].src_mi->mbmi.tx_size = TX_4X4;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+      int64_t this_rd;
+      int ratey = 0;
+      int64_t distortion = 0;
+      int rate = bmode_costs[mode];
+
+      if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
+        continue;
+
+      // Only do the oblique modes if the best so far is
+      // one of the neighboring directional modes
+      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+        if (conditional_skipintra(mode, *best_mode))
+          continue;
+      }
+
+      vpx_memcpy(tempa, ta, sizeof(ta));
+      vpx_memcpy(templ, tl, sizeof(tl));
+
+      for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+        for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+          const int block = ib + idy * 2 + idx;
+          const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+          uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+          int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
+                                                              p->src_diff);
+          tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+          xd->mi[0].src_mi->bmi[block].as_mode = mode;
+          vp9_predict_intra_block(xd, block, 1,
+                                  TX_4X4, mode,
+                                  x->skip_encode ? src : dst,
+                                  x->skip_encode ? src_stride : dst_stride,
+                                  dst, dst_stride, idx, idy, 0);
+          vp9_high_subtract_block(4, 4, src_diff, 8, src, src_stride,
+                                  dst, dst_stride, xd->bd);
+          if (xd->lossless) {
+            const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+            vp9_high_fwht4x4(src_diff, coeff, 8);
+            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                                 so->scan, so->neighbors,
+                                 cpi->sf.use_fast_coef_costing);
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
+            vp9_high_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
+                                 dst, dst_stride,
+                                 p->eobs[block], xd->bd);
+          } else {
+            int64_t unused;
+            const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+            const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+            vp9_high_fht4x4(src_diff, coeff, 8, tx_type);
+            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                                 so->scan, so->neighbors,
+                                 cpi->sf.use_fast_coef_costing);
+            distortion += vp9_high_block_error(coeff,
+                                               BLOCK_OFFSET(pd->dqcoeff, block),
+                                               16, &unused, xd->bd) >> 2;
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
+            vp9_high_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+                                dst, dst_stride, p->eobs[block], xd->bd);
+          }
+        }
+      }
+
+      rate += ratey;
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        *bestrate = rate;
+        *bestratey = ratey;
+        *bestdistortion = distortion;
+        best_rd = this_rd;
+        *best_mode = mode;
+        vpx_memcpy(a, tempa, sizeof(tempa));
+        vpx_memcpy(l, templ, sizeof(templ));
+        for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+          vpx_memcpy(best_dst16 + idy * 8,
+                     CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+                     num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+        }
+      }
+    next_highbd:
+      {}
+    }
+    if (best_rd >= rd_thresh || x->skip_encode)
+      return best_rd;
+
+    for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+      vpx_memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+                 best_dst16 + idy * 8,
+                 num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+    }
+
+    return best_rd;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
     int ratey = 0;
@@ -1118,6 +1303,16 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
                                                pd->pre[ref].stride)];
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_high_build_inter_predictor(pre, pd->pre[ref].stride,
+                                   dst, pd->dst.stride,
+                                   &mi->bmi[i].as_mv[ref].as_mv,
+                                   &xd->block_refs[ref]->sf, width, height, ref,
+                                   kernel, MV_PRECISION_Q3,
+                                   mi_col * MI_SIZE + 4 * (i % 2),
+                                   mi_row * MI_SIZE + 4 * (i / 2), xd->bd);
+  } else {
     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
@@ -1126,11 +1321,32 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                               mi_col * MI_SIZE + 4 * (i % 2),
                               mi_row * MI_SIZE + 4 * (i / 2));
   }
+#else
+    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
+                              dst, pd->dst.stride,
+                              &mi->bmi[i].as_mv[ref].as_mv,
+                              &xd->block_refs[ref]->sf, width, height, ref,
+                              kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE + 4 * (i % 2),
+                              mi_row * MI_SIZE + 4 * (i / 2));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_high_subtract_block(
+        height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+        src, p->src.stride, dst, pd->dst.stride, xd->bd);
+  } else {
+    vp9_subtract_block(
+        height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
+        src, p->src.stride, dst, pd->dst.stride);
+  }
+#else
   vp9_subtract_block(height, width,
                      raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
-                     src, p->src.stride,
-                     dst, pd->dst.stride);
+                     src, p->src.stride, dst, pd->dst.stride);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   k = i;
   for (idy = 0; idy < height / 4; ++idy) {
@@ -1143,8 +1359,19 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                     coeff, 8);
       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        thisdistortion += vp9_high_block_error(coeff,
+                                               BLOCK_OFFSET(pd->dqcoeff, k),
+                                               16, &ssz, xd->bd);
+      } else {
+        thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
+                                          16, &ssz);
+      }
+#else
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       thissse += ssz;
       thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
                               so->scan, so->neighbors,
@@ -1901,7 +2128,12 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   int_mv ref_mv[2];
   int ite, ref;
   // Prediction buffer from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t *second_pred;
+  uint8_t *second_pred_alloc;
+#else
   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
   // Do joint motion search in compound mode to get more accurate mv.
@@ -1912,6 +2144,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
   };
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint16_t));
+    second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc);
+  } else {
+    second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+    second_pred = second_pred_alloc;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   for (ref = 0; ref < 2; ++ref) {
     ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
@@ -1950,6 +2191,28 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     ref_yv12[1] = xd->plane[0].pre[1];
 
     // Get pred block from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_high_build_inter_predictor(ref_yv12[!id].buf,
+                                     ref_yv12[!id].stride,
+                                     second_pred, pw,
+                                     &frame_mv[refs[!id]].as_mv,
+                                     &xd->block_refs[!id]->sf,
+                                     pw, ph, 0,
+                                     kernel, MV_PRECISION_Q3,
+                                     mi_col * MI_SIZE, mi_row * MI_SIZE,
+                                     xd->bd);
+    } else {
+      vp9_build_inter_predictor(ref_yv12[!id].buf,
+                                ref_yv12[!id].stride,
+                                second_pred, pw,
+                                &frame_mv[refs[!id]].as_mv,
+                                &xd->block_refs[!id]->sf,
+                                pw, ph, 0,
+                                kernel, MV_PRECISION_Q3,
+                                mi_col * MI_SIZE, mi_row * MI_SIZE);
+    }
+#else
     vp9_build_inter_predictor(ref_yv12[!id].buf,
                               ref_yv12[!id].stride,
                               second_pred, pw,
@@ -1958,6 +2221,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                               pw, ph, 0,
                               kernel, MV_PRECISION_Q3,
                               mi_col * MI_SIZE, mi_row * MI_SIZE);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     // Compound motion search on first ref frame.
     if (id)
@@ -2026,7 +2290,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_free(second_pred_alloc);
+#else
   vpx_free(second_pred);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
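
second_pred_alloc is tracked separately from second_pred because in the
high-bit-depth build 16-bit buffers travel through 8-bit pointer interfaces
via an address-tagging convention, and only the true allocation address may
be passed to vpx_free(). A sketch of that convention (assuming the
CONVERT_TO_* macros as defined in vp9/common/vp9_blockd.h at the time):

    /* The tagged handle is not directly dereferenceable. */
    uint16_t *alloc = (uint16_t *)vpx_memalign(16, pw * ph * sizeof(uint16_t));
    uint8_t *handle = CONVERT_TO_BYTEPTR(alloc);    /* given to 8-bit APIs */
    uint16_t *pixels = CONVERT_TO_SHORTPTR(handle); /* recovers alloc */
    vpx_free(alloc);                                /* never free the handle */
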
 
 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
@@ -2068,12 +2336,26 @@ static void rd_encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
 
     // Calculate threshold according to dequant value.
     thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      const int shift = 2 * xd->bd - 16;
+      if (shift > 0)
+        thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
 
     // Adjust threshold according to partition size.
     thresh_ac >>= 8 - (b_width_log2(bsize) +
         b_height_log2(bsize));
     thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      const int shift = 2 * xd->bd - 16;
+      if (shift > 0)
+        thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   } else {
     thresh_ac = 0;
     thresh_dc = 0;
@@ -2145,8 +2427,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int refs[2] = { mbmi->ref_frame[0],
     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int_mv cur_mv[2];
-  int64_t this_rd = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64);
+  uint8_t *tmp_buf = tmp_buf8;
+#else
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   int pred_exists = 0;
   int intpel_mv;
   int64_t rd, tmp_rd, best_rd = INT64_MAX;
@@ -2163,6 +2450,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       (((mi_row + mi_col) >> bsl) +
        get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
+  } else {
+    tmp_buf = tmp_buf8;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   if (pred_filter_search) {
     INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
     if (xd->up_available)
@@ -2449,7 +2744,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     single_skippable[this_mode][refs[0]] = *skippable;
 
   restore_dst_buf(xd, orig_dst, orig_dst_stride);
-  return this_rd;  // if 0, this will be re-calculated by caller
+  return 0;  // The rate-distortion cost will be recalculated by the caller.
 }
 
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2510,24 +2805,23 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   ctx->mic = *xd->mi[0].src_mi;
 }
 
-// Updating rd_thresh_freq_fact[] here means that the different
-// partition/block sizes are handled independently based on the best
-// choice for the current partition. It may well be better to keep a scaled
-// best rd so far value and update rd_thresh_freq_fact based on the mode/size
-// combination that wins out.
 static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize,
                                   int best_mode_index) {
   if (cpi->sf.adaptive_rd_thresh > 0) {
     const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
     int mode;
     for (mode = 0; mode < top_mode; ++mode) {
-      int *const fact = &cpi->rd.thresh_freq_fact[bsize][mode];
-
-      if (mode == best_mode_index) {
-        *fact -= (*fact >> 3);
-      } else {
-        *fact = MIN(*fact + RD_THRESH_INC,
-                    cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+      const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
+      const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
+      BLOCK_SIZE bs;
+      for (bs = min_size; bs <= max_size; ++bs) {
+        int *const fact = &cpi->rd.thresh_freq_fact[bs][mode];
+        if (mode == best_mode_index) {
+          *fact -= (*fact >> 4);
+        } else {
+          *fact = MIN(*fact + RD_THRESH_INC,
+                      cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+        }
       }
     }
   }
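
Besides propagating each update to neighboring block sizes, the rewrite
slows the decay of the winning mode's threshold factor from 1/8 to 1/16 per
hit, while losing modes still climb by RD_THRESH_INC toward the
adaptive_rd_thresh cap. A quick numeric illustration (hypothetical starting
factor):

    int fact = 320;
    fact -= fact >> 4;  /* 320 -> 300: multiply by 15/16 */
    fact -= fact >> 4;  /* 300 -> 282 */
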
@@ -2577,15 +2871,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
   PREDICTION_MODE mode_uv[TX_SIZES];
-  const int intra_cost_penalty =
-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int best_skip2 = 0;
   uint8_t ref_frame_skip_mask[2] = { 0 };
   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
   int mode_skip_start = cpi->sf.mode_skip_start + 1;
   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
-  int mode_threshold[MAX_MODES];
+  int64_t mode_threshold[MAX_MODES];
   int *mode_map = rd_opt->mode_map[bsize];
   const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
   vp9_zero(best_mbmode);
@@ -2676,6 +2970,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
+  if (cpi->sf.alt_ref_search_fp)
+    if (!cm->show_frame)
+      if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+        mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
   if (bsize > cpi->sf.max_intra_bsize) {
     ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
     ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
@@ -3013,9 +3312,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         // based on qp, activity mask and history
         if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
             (mode_index > MIN_EARLY_TERM_INDEX)) {
-          const int qstep = xd->plane[0].dequant[1];
+          int qstep = xd->plane[0].dequant[1];
           // TODO(debargha): Enhance this by specializing for each mode_index
           int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            qstep >>= (xd->bd - 8);
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
           if (x->source_variance < UINT_MAX) {
             const int var_adjust = (x->source_variance < 16);
             scale -= var_adjust;
@@ -3331,8 +3635,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t dist_uv;
   int skip_uv;
   PREDICTION_MODE mode_uv = DC_PRED;
-  const int intra_cost_penalty =
-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
@@ -3750,9 +4054,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         // based on qp, activity mask and history
         if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
             (ref_index > MIN_EARLY_TERM_INDEX)) {
-          const int qstep = xd->plane[0].dequant[1];
+          int qstep = xd->plane[0].dequant[1];
           // TODO(debargha): Enhance this by specializing for each mode_index
           int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            qstep >>= (xd->bd - 8);
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
           if (x->source_variance < UINT_MAX) {
             const int var_adjust = (x->source_variance < 16);
             scale -= var_adjust;
index 52c603fb6484ba0b079efd0596b6cffe2588dbb1..50cb108ddd777fab8c596898add5e21482c372f4 100644 (file)
@@ -54,7 +54,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
                                       BLOCK_SIZE bsize,
                                       PICK_MODE_CONTEXT *ctx,
                                       int64_t best_rd_so_far);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
index 4e6efaeb969f45c40167778f522259c0014280af..4a8a52156285449cc734d6a4898869dd25f40fa0 100644 (file)
@@ -312,7 +312,7 @@ static void interpolate(const uint8_t *const input, int inlength,
 static void down2_symeven(const uint8_t *const input, int length,
                           uint8_t *output) {
   // Actual filter len = 2 * filter_len_half.
-  static const int16_t *filter = vp9_down2_symeven_half_filter;
+  const int16_t *filter = vp9_down2_symeven_half_filter;
   const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
   int i, j;
   uint8_t *optr = output;
@@ -368,7 +368,7 @@ static void down2_symeven(const uint8_t *const input, int length,
 static void down2_symodd(const uint8_t *const input, int length,
                          uint8_t *output) {
   // Actual filter len = 2 * filter_len_half - 1.
-  static const int16_t *filter = vp9_down2_symodd_half_filter;
+  const int16_t *filter = vp9_down2_symodd_half_filter;
   const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
   int i, j;
   uint8_t *optr = output;
@@ -529,6 +529,302 @@ void vp9_resize_plane(const uint8_t *const input,
   free(arrbuf);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_interpolate(const uint16_t *const input, int inlength,
+                               uint16_t *output, int outlength, int bd) {
+  const int64_t delta =
+      (((uint64_t)inlength << 32) + outlength / 2) / outlength;
+  const int64_t offset = inlength > outlength ?
+      (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength :
+      -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength;
+  uint16_t *optr = output;
+  int x, x1, x2, sum, k, int_pel, sub_pel;
+  int64_t y;
+
+  const interp_kernel *interp_filters =
+      choose_interp_filter(inlength, outlength);
+
+  x = 0;
+  y = offset;
+  while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+    x++;
+    y += delta;
+  }
+  x1 = x;
+  x = outlength - 1;
+  y = delta * x + offset;
+  while ((y >> INTERP_PRECISION_BITS) +
+         (int64_t)(INTERP_TAPS / 2) >= inlength) {
+    x--;
+    y -= delta;
+  }
+  x2 = x;
+  if (x1 > x2) {
+    for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k) {
+        const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+        sum += filter[k] *
+            input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))];
+      }
+      *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  } else {
+    // Initial part.
+    for (x = 0, y = offset; x < x1; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] *
+            input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
+                   0 : int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+    // Middle part.
+    for (; x <= x2; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+      *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+    // End part.
+    for (; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
+                                  inlength ?  inlength - 1 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  }
+}
+
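
highbd_interpolate mirrors the 8-bit interpolate(): source positions advance
in 32-bit fixed point, with delta the per-output-pixel step and offset
centering the output grid on the input grid. Worked through for a 2:1
downscale (inlength = 64, outlength = 32) under the definitions above:

    /* delta  = ((64 << 32) + 16) / 32 = 2 << 32  (step of 2.0 source pixels)
     * offset = ((32 << 31) + 16) / 32 = 1 << 31  (phase shift of 0.5 pixel)
     * so output pixel x is filtered around source position 2 * x + 0.5. */
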
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+                                 uint16_t *output, int bd) {
+  // Actual filter len = 2 * filter_len_half.
+  const int16_t *filter = vp9_down2_symeven_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
+  int i, j;
+  uint16_t *optr = output;
+  int l1 = filter_len_half;
+  int l2 = (length - filter_len_half);
+  l1 += (l1 & 1);
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_high(sum, bd);
+    }
+  } else {
+    // Initial part.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_high(sum, bd);
+    }
+    // Middle part.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_high(sum, bd);
+    }
+    // End part.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_high(sum, bd);
+    }
+  }
+}
+
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+                                uint16_t *output, int bd) {
+  // Actual filter len = 2 * filter_len_half - 1.
+  const int16_t *filter = vp9_down2_symodd_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
+  int i, j;
+  uint16_t *optr = output;
+  int l1 = filter_len_half - 1;
+  int l2 = (length - filter_len_half + 1);
+  l1 += (l1 & 1);
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_high(sum, bd);
+    }
+  } else {
+    // Initial part.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_high(sum, bd);
+    }
+    // Middle part.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_high(sum, bd);
+    }
+    // End part.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_high(sum, bd);
+    }
+  }
+}
+
+static void highbd_resize_multistep(const uint16_t *const input,
+                                    int length,
+                                    uint16_t *output,
+                                    int olength,
+                                    uint16_t *buf,
+                                    int bd) {
+  int steps;
+  if (length == olength) {
+    memcpy(output, input, sizeof(uint16_t) * length);
+    return;
+  }
+  steps = get_down2_steps(length, olength);
+
+  if (steps > 0) {
+    int s;
+    uint16_t *out = NULL;
+    uint16_t *tmpbuf = NULL;
+    uint16_t *otmp, *otmp2;
+    int filteredlength = length;
+    if (buf == NULL) {
+      tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * length);
+      otmp = tmpbuf;
+    } else {
+      otmp = buf;
+    }
+    otmp2 = otmp + get_down2_length(length, 1);
+    for (s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint16_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;
+      else
+        out = (s & 1 ? otmp2 : otmp);
+      if (filteredlength & 1)
+        highbd_down2_symodd(in, filteredlength, out, bd);
+      else
+        highbd_down2_symeven(in, filteredlength, out, bd);
+      filteredlength = proj_filteredlength;
+    }
+    if (filteredlength != olength) {
+      highbd_interpolate(out, filteredlength, output, olength, bd);
+    }
+    if (tmpbuf)
+      free(tmpbuf);
+  } else {
+    highbd_interpolate(input, length, output, olength, bd);
+  }
+}
+
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  int i;
+  uint16_t *iptr = img;
+  uint16_t *aptr = arr;
+  for (i = 0; i < len; ++i, iptr += stride) {
+    *aptr++ = *iptr;
+  }
+}
+
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  int i;
+  uint16_t *iptr = img;
+  uint16_t *aptr = arr;
+  for (i = 0; i < len; ++i, iptr += stride) {
+    *iptr = *aptr++;
+  }
+}
+
+void vp9_highbd_resize_plane(const uint8_t *const input,
+                             int height,
+                             int width,
+                             int in_stride,
+                             uint8_t *output,
+                             int height2,
+                             int width2,
+                             int out_stride,
+                             int bd) {
+  int i;
+  uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
+  uint16_t *tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) *
+                                        (width < height ? height : width));
+  uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2));
+  for (i = 0; i < height; ++i) {
+    highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
+                            intbuf + width2 * i, width2, tmpbuf, bd);
+  }
+  for (i = 0; i < width2; ++i) {
+    highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+    highbd_resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf,
+                            bd);
+    highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
+                           arrbuf + height);
+  }
+  free(intbuf);
+  free(tmpbuf);
+  free(arrbuf);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
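
Like its 8-bit counterpart, vp9_highbd_resize_plane is separable: rows are
resized into intbuf, then each destination column is gathered into arrbuf,
resized vertically, and scattered back out. A hedged usage sketch (buffer
names hypothetical; pixel pointers must already be in the
CONVERT_TO_BYTEPTR convention):

    /* Downscale a 10-bit 1280x720 luma plane to 640x360. */
    vp9_highbd_resize_plane(src_y, 720, 1280, src_stride,
                            dst_y, 360, 640, dst_stride, 10);
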
 void vp9_resize_frame420(const uint8_t *const y,
                          int y_stride,
                          const uint8_t *const u, const uint8_t *const v,
@@ -574,3 +870,51 @@ void vp9_resize_frame444(const uint8_t *const y, int y_stride,
   vp9_resize_plane(v, height, width, uv_stride,
                    ov, oheight, owidth, ouv_stride);
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_frame420(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp9_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, height / 2, width / 2, uv_stride,
+                          ou, oheight / 2, owidth / 2, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, height / 2, width / 2, uv_stride,
+                          ov, oheight / 2, owidth / 2, ouv_stride, bd);
+}
+
+void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp9_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, height, width / 2, uv_stride,
+                          ou, oheight, owidth / 2, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, height, width / 2, uv_stride,
+                          ov, oheight, owidth / 2, ouv_stride, bd);
+}
+
+void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp9_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, height, width, uv_stride,
+                          ou, oheight, owidth, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, height, width, uv_stride,
+                          ov, oheight, owidth, ouv_stride, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
index 1818cd47efbca10b38302e525b39d8f71f32a060..067af53f993653abd07f6cd5e249fe5ea4be4df4 100644 (file)
@@ -65,4 +65,60 @@ void vp9_resize_frame444(const uint8_t *const y,
                          int oheight,
                          int owidth);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_plane(const uint8_t *const input,
+                             int height,
+                             int width,
+                             int in_stride,
+                             uint8_t *output,
+                             int height2,
+                             int width2,
+                             int out_stride,
+                             int bd);
+void vp9_highbd_resize_frame420(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,
+                                int width,
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,
+                                int owidth,
+                                int bd);
+void vp9_highbd_resize_frame422(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,
+                                int width,
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,
+                                int owidth,
+                                int bd);
+void vp9_highbd_resize_frame444(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,
+                                int width,
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,
+                                int owidth,
+                                int bd);
+#endif    // CONFIG_VP9_HIGHBITDEPTH
 #endif    // VP9_ENCODER_VP9_RESIZE_H_
index 52e9a8e7b90178bd5c3796162149a3c97c368711..9368ac1f7f954e425070b2aa01c3ef93333a9110 100644 (file)
@@ -96,28 +96,25 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     if (MIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask = DISABLE_ALL_SPLIT;
       sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
+      sf->partition_search_breakout_dist_thr = (1 << 25);
     } else {
       sf->max_intra_bsize = BLOCK_32X32;
       sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
       sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
+      sf->partition_search_breakout_dist_thr = (1 << 23);
     }
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
     sf->adaptive_pred_interp_filter = 0;
     sf->adaptive_mode_search = 1;
     sf->cb_partition_search = !boosted;
     sf->cb_pred_filter_search = 1;
     sf->alt_ref_search_fp = 1;
-    sf->motion_field_mode_search = !boosted;
     sf->recode_loop = ALLOW_RECODE_KFMAXBW;
     sf->adaptive_rd_thresh = 3;
     sf->mode_skip_start = 6;
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
     sf->adaptive_interp_filter_search = 1;
-
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->partition_search_breakout_dist_thr = (1 << 25);
-    else
-      sf->partition_search_breakout_dist_thr = (1 << 23);
     sf->partition_search_breakout_rate_thr = 1000;
   }
 
@@ -133,6 +130,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     sf->use_lp32x32fdct = 1;
     sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
     sf->use_fast_coef_costing = 1;
+    sf->motion_field_mode_search = !boosted;
 
     if (MIN(cm->width, cm->height) >= 720)
       sf->partition_search_breakout_dist_thr = (1 << 26);
@@ -423,6 +421,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
   } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_more;
+  } else if (sf->mv.subpel_search_method == SUBPEL_SURFACE_FIT) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_surface_fit;
   }
 
   cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
index ed840086330f8571f10428bb80cf1c08a3b6219a..f224e65de2d437cb1e463951d87fbf0027498809 100644 (file)
@@ -79,6 +79,8 @@ typedef enum {
 typedef enum {
   SUBPEL_TREE = 0,
   SUBPEL_TREE_PRUNED = 1,
+  SUBPEL_TREE_PRUNED_MORE = 2,
+  SUBPEL_SURFACE_FIT = 3,
   // Other methods to come
 } SUBPEL_SEARCH_METHODS;
 
index 8435640c8c132cb5b550beddb2c2cae348b7cc75..d02d0c6d69208a3a2e2954474299462de4468e93 100644 (file)
@@ -43,6 +43,44 @@ void vp9_ssim_parms_8x8_c(uint8_t *s, int sp, uint8_t *r, int rp,
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_ssim_parms_8x8_c(uint16_t *s, int sp, uint16_t *r, int rp,
+                                 uint32_t *sum_s, uint32_t *sum_r,
+                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                                 uint32_t *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+
+void vp9_highbd_ssim_parms_8x8_shift_c(uint16_t *s, int sp, uint16_t *r, int rp,
+                                       uint32_t *sum_s, uint32_t *sum_r,
+                                       uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                                       uint32_t *sum_sxr, unsigned int bd,
+                                       unsigned int shift) {
+  int i, j;
+  const int max_val = (1 << bd) - 1;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      int sj = s[j] > max_val ? max_val : s[j];
+      int rj = r[j] > max_val ? max_val : r[j];
+      *sum_s += sj >> shift;
+      *sum_r += rj >> shift;
+      *sum_sq_s += (sj * sj) >> (2 * shift);
+      *sum_sq_r += (rj * rj) >> (2 * shift);
+      *sum_sxr += (sj * rj) >> (2 * shift);
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static const int64_t cc1 =  26634;  // (64^2*(.01*255)^2
 static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2
 
@@ -73,6 +111,37 @@ static double ssim_8x8(uint8_t *s, int sp, uint8_t *r, int rp) {
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static double high_ssim_8x8_shift(uint16_t *s, int sp, uint16_t *r, int rp,
+                                  unsigned int bd, unsigned int shift) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  const int oshift = bd - 8;
+  vp9_highbd_ssim_parms_8x8_shift(s, sp, r, rp, &sum_s,
+                                  &sum_r, &sum_sq_s, &sum_sq_r,
+                                  &sum_sxr, bd, shift);
+  return similarity(sum_s >> oshift,
+                    sum_r >> oshift,
+                    sum_sq_s >> (2 * oshift),
+                    sum_sq_r >> (2 * oshift),
+                    sum_sxr >> (2 * oshift),
+                    64);
+}
+
+static double high_ssim_8x8(uint16_t *s, int sp, uint16_t *r, int rp,
+                            unsigned int bd) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  const int oshift = bd - 8;
+  vp9_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                            &sum_sxr);
+  return similarity(sum_s >> oshift,
+                    sum_r >> oshift,
+                    sum_sq_s >> (2 * oshift),
+                    sum_sq_r >> (2 * oshift),
+                    sum_sxr >> (2 * oshift),
+                    64);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // We are using a 8x8 moving window with starting location of each 8x8 window
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
@@ -94,6 +163,47 @@ double vp9_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
   ssim_total /= samples;
   return ssim_total;
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vp9_highbd_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
+                        int stride_img2, int width, int height,
+                        unsigned int bd, unsigned int shift) {
+  int i, j;
+  int samples = 0;
+  double ssim_total = 0;
+
+  if (shift) {
+    // Sample points start at each 4x4 location.
+    for (i = 0; i <= height - 8;
+         i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+      for (j = 0; j <= width - 8; j += 4) {
+        double v = high_ssim_8x8_shift(CONVERT_TO_SHORTPTR(img1 + j),
+                                       stride_img1,
+                                       CONVERT_TO_SHORTPTR(img2 + j),
+                                       stride_img2,
+                                       bd, shift);
+        ssim_total += v;
+        samples++;
+      }
+    }
+  } else {
+    // Sample points start at each 4x4 location.
+    for (i = 0; i <= height - 8;
+         i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+      for (j = 0; j <= width - 8; j += 4) {
+        double v = high_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2,
+                                 bd);
+        ssim_total += v;
+        samples++;
+      }
+    }
+  }
+  ssim_total /= samples;
+  return ssim_total;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
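
Both high-bit-depth SSIM paths shift the accumulated moments down to the
8-bit scale because similarity() keeps the fixed constants cc1 and cc2,
which are derived from the 8-bit dynamic range of 255. First moments scale
with 2^(bd - 8) and second moments with 2^(2 * (bd - 8)); for example, at
bd = 10:

    /* An 8x8 block of mid-gray: 10-bit samples are 4x the 8-bit values. */
    unsigned int sum_s = 64 * 512;           /* vs. 64 * 128 at 8 bits */
    unsigned int sum_sq_s = 64 * 512 * 512;  /* vs. 64 * 128 * 128 */
    sum_s >>= 10 - 8;                        /* back to 64 * 128 */
    sum_sq_s >>= 2 * (10 - 8);               /* back to 64 * 128 * 128 */
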
 double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                      double *weight) {
   double a, b, c;
@@ -141,3 +251,63 @@ double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
 
   return ssim_all;
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
+                            YV12_BUFFER_CONFIG *dest,
+                            double *weight, unsigned int bd,
+                            unsigned int shift) {
+  double a, b, c;
+  double ssimv;
+
+  a = vp9_highbd_ssim2(source->y_buffer, dest->y_buffer,
+                       source->y_stride, dest->y_stride,
+                       source->y_crop_width, source->y_crop_height,
+                       bd, shift);
+
+  b = vp9_highbd_ssim2(source->u_buffer, dest->u_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height,
+                       bd, shift);
+
+  c = vp9_highbd_ssim2(source->v_buffer, dest->v_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height,
+                       bd, shift);
+
+  ssimv = a * .8 + .1 * (b + c);
+
+  *weight = 1;
+
+  return ssimv;
+}
+
+double vp9_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source,
+                             YV12_BUFFER_CONFIG *dest, double *ssim_y,
+                             double *ssim_u, double *ssim_v,
+                             unsigned int bd, unsigned int shift) {
+  double ssim_all = 0;
+  double a, b, c;
+
+  a = vp9_highbd_ssim2(source->y_buffer, dest->y_buffer,
+                       source->y_stride, dest->y_stride,
+                       source->y_crop_width, source->y_crop_height,
+                       bd, shift);
+
+  b = vp9_highbd_ssim2(source->u_buffer, dest->u_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height,
+                       bd, shift);
+
+  c = vp9_highbd_ssim2(source->v_buffer, dest->v_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height,
+                       bd, shift);
+  *ssim_y = a;
+  *ssim_u = b;
+  *ssim_v = c;
+  ssim_all = (a * 4 + b + c) / 6;
+
+  return ssim_all;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
index d1dd1b722371c682ded5df6a816ad50cbbef520b..28baa4b596c9c9552648f54839820fa3a46f37f9 100644 (file)
@@ -23,6 +23,22 @@ double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
 double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                       double *ssim_y, double *ssim_u, double *ssim_v);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
+                            YV12_BUFFER_CONFIG *dest,
+                            double *weight,
+                            unsigned int bd,
+                            unsigned int shift);
+
+double vp9_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source,
+                             YV12_BUFFER_CONFIG *dest,
+                             double *ssim_y,
+                             double *ssim_u,
+                             double *ssim_v,
+                             unsigned int bd,
+                             unsigned int shift);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
index 6fd796d397c7f167ac0191dd4d4fe08d6bc7f552..eeb1ce9294e5e8db7cbb1e26107462f545924d76 100644 (file)
@@ -56,6 +56,34 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
     mv_precision_uv = MV_PRECISION_Q3;
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_high_build_inter_predictor(y_mb_ptr, stride,
+                                   &pred[0], 16,
+                                   &mv,
+                                   scale,
+                                   16, 16,
+                                   which_mv,
+                                   kernel, MV_PRECISION_Q3, x, y, xd->bd);
+
+    vp9_high_build_inter_predictor(u_mb_ptr, uv_stride,
+                                   &pred[256], uv_block_width,
+                                   &mv,
+                                   scale,
+                                   uv_block_width, uv_block_height,
+                                   which_mv,
+                                   kernel, mv_precision_uv, x, y, xd->bd);
+
+    vp9_high_build_inter_predictor(v_mb_ptr, uv_stride,
+                                   &pred[512], uv_block_width,
+                                   &mv,
+                                   scale,
+                                   uv_block_width, uv_block_height,
+                                   which_mv,
+                                   kernel, mv_precision_uv, x, y, xd->bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
                             &mv,
@@ -133,6 +161,54 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1_8,
+                                        unsigned int stride,
+                                        uint8_t *frame2_8,
+                                        unsigned int block_width,
+                                        unsigned int block_height,
+                                        int strength,
+                                        int filter_weight,
+                                        unsigned int *accumulator,
+                                        uint16_t *count) {
+  uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
+  uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
+  unsigned int i, j, k;
+  int modifier;
+  int byte = 0;
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+  for (i = 0, k = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++, k++) {
+      int src_byte = frame1[byte];
+      int pixel_value = *frame2++;
+
+      modifier   = src_byte - pixel_value;
+      // This is an integer approximation of:
+      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+      // modifier = (int)roundf(coeff > 16 ? 0 : 16 - coeff);
+      modifier *= modifier;
+      modifier *= 3;
+      modifier += rounding;
+      modifier >>= strength;
+
+      if (modifier > 16)
+        modifier = 16;
+
+      modifier = 16 - modifier;
+      modifier *= filter_weight;
+
+      count[k] += modifier;
+      accumulator[k] += modifier * pixel_value;
+
+      byte++;
+    }
+
+    byte += stride - block_width;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
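
The high-bit-depth temporal filter keeps the 8-bit weighting rule: each
pixel's blend weight is 16 minus an integer approximation of
3 * d^2 / 2^strength, clamped at 16. The caller compensates for the wider
dynamic range by raising strength by 2 * (bd - 8) (see adj_strength below),
since the squared difference grows by exactly that many bits. A worked
instance:

    /* 8-bit: strength = 6, difference d = 10:
     *   (3 * 100 + 32) >> 6 = 5, so weight = 16 - 5 = 11.
     * 10-bit: the same edge gives d = 40; with adj_strength = 6 + 4:
     *   (3 * 1600 + 512) >> 10 = 5, so the weight is unchanged. */
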
 static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
@@ -209,13 +285,26 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
   MACROBLOCKD *mbd = &cpi->mb.e_mbd;
   YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
   uint8_t *dst1, *dst2;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t,  predictor16, 16 * 16 * 3);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor8, 16 * 16 * 3);
+  uint8_t *predictor;
+#else
   DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor, 16 * 16 * 3);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
   const int mb_uv_width  = 16 >> mbd->plane[1].subsampling_x;
 
   // Save input state
   uint8_t* input_buffer[MAX_MB_PLANE];
   int i;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    predictor = CONVERT_TO_BYTEPTR(predictor16);
+  } else {
+    predictor = predictor8;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   for (i = 0; i < MAX_MB_PLANE; i++)
     input_buffer[i] = mbd->plane[i].pre[0].buf;
@@ -286,6 +375,44 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
               predictor, scale,
               mb_col * 16, mb_row * 16);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            int adj_strength = strength + 2 * (mbd->bd - 8);
+            // Apply the filter (YUV)
+            vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
+                                             f->y_stride,
+                                             predictor, 16, 16, adj_strength,
+                                             filter_weight,
+                                             accumulator, count);
+            vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
+                                             f->uv_stride, predictor + 256,
+                                             mb_uv_width, mb_uv_height,
+                                             adj_strength,
+                                             filter_weight, accumulator + 256,
+                                             count + 256);
+            vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
+                                             f->uv_stride, predictor + 512,
+                                             mb_uv_width, mb_uv_height,
+                                             adj_strength, filter_weight,
+                                             accumulator + 512, count + 512);
+          } else {
+            // Apply the filter (YUV)
+            vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+                                      predictor, 16, 16,
+                                      strength, filter_weight,
+                                      accumulator, count);
+            vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+                                      predictor + 256,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 256,
+                                      count + 256);
+            vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+                                      predictor + 512,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 512,
+                                      count + 512);
+          }
+#else
           // Apply the filter (YUV)
           vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
                                     predictor, 16, 16,
@@ -301,9 +428,108 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 512,
                                     count + 512);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         }
       }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        uint16_t *dst1_16;
+        uint16_t *dst2_16;
+        // Normalize filter output to produce AltRef frame
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+        stride = cpi->alt_ref_buffer.y_stride;
+        byte = mb_y_offset;
+        for (i = 0, k = 0; i < 16; i++) {
+          for (j = 0; j < 16; j++, k++) {
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+
+            dst1_16[byte] = (uint16_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+
+          byte += stride - 16;
+        }
+
+        dst1 = cpi->alt_ref_buffer.u_buffer;
+        dst2 = cpi->alt_ref_buffer.v_buffer;
+        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+        dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+        stride = cpi->alt_ref_buffer.uv_stride;
+        byte = mb_uv_offset;
+        for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (j = 0; j < mb_uv_width; j++, k++) {
+            int m = k + 256;
+
+            // U
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+            dst1_16[byte] = (uint16_t)pval;
+
+            // V
+            pval = accumulator[m] + (count[m] >> 1);
+            pval *= fixed_divide[count[m]];
+            pval >>= 19;
+            dst2_16[byte] = (uint16_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+
+          byte += stride - mb_uv_width;
+        }
+      } else {
+        // Normalize filter output to produce AltRef frame
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        stride = cpi->alt_ref_buffer.y_stride;
+        byte = mb_y_offset;
+        for (i = 0, k = 0; i < 16; i++) {
+          for (j = 0; j < 16; j++, k++) {
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+
+            dst1[byte] = (uint8_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+          byte += stride - 16;
+        }
+
+        dst1 = cpi->alt_ref_buffer.u_buffer;
+        dst2 = cpi->alt_ref_buffer.v_buffer;
+        stride = cpi->alt_ref_buffer.uv_stride;
+        byte = mb_uv_offset;
+        for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (j = 0; j < mb_uv_width; j++, k++) {
+            int m = k + 256;
+
+            // U
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+            dst1[byte] = (uint8_t)pval;
+
+            // V
+            pval = accumulator[m] + (count[m] >> 1);
+            pval *= fixed_divide[count[m]];
+            pval >>= 19;
+            dst2[byte] = (uint8_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+          byte += stride - mb_uv_width;
+        }
+      }
+#else
       // Normalize filter output to produce AltRef frame
       dst1 = cpi->alt_ref_buffer.y_buffer;
       stride = cpi->alt_ref_buffer.y_stride;
@@ -347,6 +573,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
         }
         byte += stride - mb_uv_width;
       }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       mb_y_offset += 16;
       mb_uv_offset += mb_uv_width;
     }
@@ -467,7 +694,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
                                       get_frame_new_buffer(cm)->y_crop_height,
                                       get_frame_new_buffer(cm)->y_crop_width,
                                       get_frame_new_buffer(cm)->y_crop_height);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
     for (frame = 0; frame < frames_to_blur; ++frame) {
       if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
           cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
@@ -502,7 +730,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
                                       frames[0]->y_crop_height,
                                       frames[0]->y_crop_width,
                                       frames[0]->y_crop_height);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   temporal_filter_iterate_c(cpi, frames, frames_to_blur,
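
The normalization loops above (both bit depths) share one idiom: a rounded divide by count, done as a multiply by a 19-bit fixed-point reciprocal instead of a hardware divide. The high-bit-depth path differs only in raising the filter strength by 2 * (bd - 8), which appears to compensate for the wider pixel range. Below is a minimal standalone sketch, assuming the table is filled as in the encoder's vp9_temporal_filter_init() (fixed_divide[i] = (1 << 19) / i); the sample values are made up:

/* Sketch (not part of the patch) of the rounding divide used in the
 * normalization loops above. */
#include <assert.h>

static unsigned int fixed_divide[512];

static void init_fixed_divide(void) {
  int i;
  for (i = 1; i < 512; ++i)
    fixed_divide[i] = 0x80000 / i;  /* 0x80000 == 1 << 19 */
}

int main(void) {
  unsigned int accumulator = 1531;  /* sum of weighted pixel values */
  unsigned int count = 6;           /* sum of the filter weights applied */
  unsigned int pval;

  init_fixed_divide();
  pval = accumulator + (count >> 1);  /* add count/2 to round, not truncate */
  pval *= fixed_divide[count];        /* multiply by the reciprocal ...    */
  pval >>= 19;                        /* ... and drop the fixed-point bits */
  assert(pval == (1531 + 3) / 6);     /* 255: a rounded divide by count */
  return 0;
}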
index 063c0bafe7babb161e7699b5eacebae4ffbabbdd..da2b6857ca279d1d7cbc420bb870fbd719929ca3 100644 (file)
@@ -53,6 +53,12 @@ extern const int16_t *vp9_dct_value_cost_ptr;
  *  fields are not.
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const int16_t *vp9_dct_value_cost_high10_ptr;
+extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
+extern const int16_t *vp9_dct_value_cost_high12_ptr;
+extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
index 07a3be8dbb9ce54ef79354adf9c16d563549082c..9414120f64e33c23ea64f8a9b524c036b8e8df62 100644 (file)
@@ -93,6 +93,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_intrapred_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_loopfilter_intrin_sse2.c
 endif
 
 # common (c)
index fbf4aa2fccfa61295e462f8e3435259382472e84..473166cb7e67c0bcfa5cefac4a74ca31ad222e14 100644 (file)
@@ -274,27 +274,47 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   }
 
 #if !CONFIG_VP9_HIGHBITDEPTH
-  if (cfg->g_profile > (unsigned int)PROFILE_1)
+  if (cfg->g_profile > (unsigned int)PROFILE_1) {
     ERROR("Profile > 1 not supported in this build configuration");
+  }
 #endif
   if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
-      extra_cfg->bit_depth > VPX_BITS_8)
+      cfg->g_bit_depth > VPX_BITS_8) {
     ERROR("Codec high bit-depth not supported in profile < 2");
+  }
+  if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+      cfg->g_input_bit_depth > 8) {
+    ERROR("Source high bit-depth not supported in profile < 2");
+  }
   if (cfg->g_profile > (unsigned int)PROFILE_1 &&
-      extra_cfg->bit_depth == VPX_BITS_8)
+      cfg->g_bit_depth == VPX_BITS_8) {
     ERROR("Codec bit-depth 8 not supported in profile > 1");
+  }
 
   return VPX_CODEC_OK;
 }
 
-
 static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
                                     const vpx_image_t *img) {
   switch (img->fmt) {
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I42016:
+      break;
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I444:
+      if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) {
+        ERROR("Invalid image format. I422, I444 images are "
+              "not supported in profile.");
+      }
+      break;
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+      if (ctx->cfg.g_profile != (unsigned int)PROFILE_1 &&
+          ctx->cfg.g_profile != (unsigned int)PROFILE_3) {
+        ERROR("Invalid image format. 16-bit I422, I444 images are "
+              "not supported in profile.");
+      }
       break;
     default:
       ERROR("Invalid image format. Only YV12, I420, I422, I444 images are "
@@ -330,7 +350,7 @@ static vpx_codec_err_t set_encoder_config(
   oxcf->profile = cfg->g_profile;
   oxcf->width   = cfg->g_w;
   oxcf->height  = cfg->g_h;
-  oxcf->bit_depth = extra_cfg->bit_depth;
+  oxcf->bit_depth = cfg->g_bit_depth;
   oxcf->input_bit_depth = cfg->g_input_bit_depth;
   // guess a frame rate if out of whack, use 30
   oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
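
These validation changes retire extra_cfg->bit_depth in favor of the public g_bit_depth and g_input_bit_depth fields of vpx_codec_enc_cfg_t. A hedged sketch of a caller driving them (assumes a high-bit-depth capable build; the helper name is illustrative and error handling is minimal):

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch: request a 10-bit, profile-2 VP9 encode through the public
   config fields validated above. */
static vpx_codec_err_t init_10bit_encoder(vpx_codec_ctx_t *ctx,
                                          unsigned int w, unsigned int h) {
  vpx_codec_enc_cfg_t cfg;
  vpx_codec_err_t res =
      vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0);
  if (res != VPX_CODEC_OK) return res;
  cfg.g_w = w;
  cfg.g_h = h;
  cfg.g_profile = 2;              /* profiles > 1 carry 10/12-bit streams */
  cfg.g_bit_depth = VPX_BITS_10;  /* codec bit depth, formerly extra_cfg */
  cfg.g_input_bit_depth = 10;     /* bit depth of the source frames */
  /* validate_config() above rejects g_bit_depth > VPX_BITS_8 for
     profiles 0 and 1, and in non-high-bit-depth builds. */
  return vpx_codec_enc_init(ctx, vpx_codec_vp9_cx(), &cfg,
                            VPX_CODEC_USE_HIGHBITDEPTH);
}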
index 393c66ebdad2b39a68597fe773451acf9ffa84d0..85e32d351e9d86d89cc651e9cefcc1a1dbb57198 100644 (file)
@@ -437,7 +437,6 @@ static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
     // call to get_frame.
     if (!(*iter)) {
       img = &ctx->img;
-      img->bit_depth = (int)ctx->pbi->common.bit_depth;
       *iter = img;
     }
   }
index fc98b62c5e8698d5044d10aaa41362900bf21862..0398b40e84430b9899b3a02969718eda9adb50e7 100644 (file)
@@ -46,6 +46,22 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
   img->stride[VPX_PLANE_U] = yv12->uv_stride;
   img->stride[VPX_PLANE_V] = yv12->uv_stride;
   img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // vpx_image_t uses byte strides and a pointer to the first byte
+    // of the image.
+    img->fmt |= VPX_IMG_FMT_HIGHBITDEPTH;
+    img->bit_depth = yv12->bit_depth;
+    img->planes[VPX_PLANE_Y] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+    img->planes[VPX_PLANE_U] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+    img->planes[VPX_PLANE_V] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+    img->planes[VPX_PLANE_ALPHA] = NULL;
+    img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride;
+    img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride;
+    img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride;
+    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   img->bps = bps;
   img->user_priv = user_priv;
   img->img_data = yv12->buffer_alloc;
@@ -72,6 +88,30 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   yv12->y_stride = img->stride[VPX_PLANE_Y];
   yv12->uv_stride = img->stride[VPX_PLANE_U];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    // In vpx_image_t
+    //     planes point to uint8 address of start of data
+    //     stride counts uint8s to reach next row
+    // In YV12_BUFFER_CONFIG
+    //     y_buffer, u_buffer, v_buffer point to uint16 address of data
+    //     stride and border counts in uint16s
+    // This means that all the address calculations in the main body of code
+    // should work correctly.
+    // However, before we do any pixel operations we need to cast the address
+    // to a uint16 pointer and double its value.
+    yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+    yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+    yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+    yv12->y_stride >>= 1;
+    yv12->uv_stride >>= 1;
+    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+  } else {
+    yv12->flags = 0;
+  }
+  yv12->border  = (yv12->y_stride - img->w) / 2;
+#else
   yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   return VPX_CODEC_OK;
 }
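
The comment block above relies on libvpx's halved-address convention for high-bit-depth buffers. A minimal sketch of the two macros (as defined, up to whitespace and parenthesization, in vp9/common/vp9_common.h under CONFIG_VP9_HIGHBITDEPTH) and of why element-wise arithmetic on the converted pointer still lands on the right sample:

#include <stdint.h>

#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

/* buf is a CONVERT_TO_BYTEPTR()-style pointer and stride is in samples.
   Stepping the halved address by (r * stride + c) and shifting back left
   advances the true uint16_t base by exactly r * stride + c samples. */
static uint16_t read_sample(const uint8_t *buf, int stride, int r, int c) {
  return *CONVERT_TO_SHORTPTR(buf + r * stride + c);
}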
index e58b61ea390d7878240d2aab99adc9d45dc51b13..6ae461d15f824f2507c7fd8afc6a84fd6ed9aa22 100644 (file)
@@ -110,6 +110,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t   *img,
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_VPXI420:
     case VPX_IMG_FMT_VPXYV12:
+    case VPX_IMG_FMT_I42016:
       ycs = 1;
       break;
     default:
@@ -209,39 +210,40 @@ int vpx_img_set_rect(vpx_image_t  *img,
       img->planes[VPX_PLANE_PACKED] =
         img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED];
     } else {
+      const int bytes_per_sample =
+          (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
       data = img->img_data;
 
       if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) {
         img->planes[VPX_PLANE_ALPHA] =
-          data + x + y * img->stride[VPX_PLANE_ALPHA];
+            data + x * bytes_per_sample + y * img->stride[VPX_PLANE_ALPHA];
         data += img->h * img->stride[VPX_PLANE_ALPHA];
       }
 
-      img->planes[VPX_PLANE_Y] = data + x + y * img->stride[VPX_PLANE_Y];
+      img->planes[VPX_PLANE_Y] = data + x * bytes_per_sample +
+          y * img->stride[VPX_PLANE_Y];
       data += img->h * img->stride[VPX_PLANE_Y];
 
       if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) {
-        img->planes[VPX_PLANE_U] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+        img->planes[VPX_PLANE_U] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
         data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
-        img->planes[VPX_PLANE_V] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+        img->planes[VPX_PLANE_V] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
       } else {
-        img->planes[VPX_PLANE_V] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+        img->planes[VPX_PLANE_V] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
         data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
-        img->planes[VPX_PLANE_U] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+        img->planes[VPX_PLANE_U] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
       }
     }
-
     return 0;
   }
-
   return -1;
 }
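
The vpx_img_set_rect() hunk above scales only the horizontal term: stride[] already counts bytes, so y * stride is format-independent, while x counts samples and needs two bytes each under VPX_IMG_FMT_HIGHBITDEPTH. A condensed sketch of that rule (chroma subsampling shifts omitted; the helper name is illustrative):

#include "vpx/vpx_image.h"

/* x and y select a sample; stride is in bytes, so only x is scaled by
   the sample size. */
static unsigned char *plane_origin(const vpx_image_t *img, int plane,
                                   unsigned int x, unsigned int y) {
  const int bytes_per_sample =
      (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
  return img->planes[plane] + x * bytes_per_sample +
         y * img->stride[plane];
}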
 
index ffeefb8192fb9c5eeac0fb1a06d88dc5b021ee2b..8eee303f45d5cce50019b93eea28df0ca55e8711 100644 (file)
@@ -63,4 +63,15 @@ typedef size_t uintptr_t;
 #include <inttypes.h>
 #endif
 
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+#if CONFIG_VP9_HIGHBITDEPTH && CONFIG_VP9
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif
+
 #endif  // VPX_VPX_INTEGER_H_
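
A back-of-the-envelope check on the typedefs above: with 12-bit input, even a single DC accumulation of a 32-wide transform overflows int16_t, which is why high-bit-depth builds widen tran_low_t to 32 bits and give tran_high_t 64 bits of intermediate headroom. The arithmetic below is illustrative, not the exact vp9 transform scaling:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* A flat 12-bit block has samples up to 4095; summing 32 of them for a
     DC term already reaches 4095 * 32 = 131040, well past INT16_MAX. */
  const int32_t dc = 4095 * 32;
  printf("dc=%d exceeds INT16_MAX=%d: %s\n", dc, INT16_MAX,
         dc > INT16_MAX ? "yes" : "no");
  return 0;
}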
index 1e4a8e28873e571b96ccbfdd3c0c53082be36893..42c98f5a838241b9045ff35047b9a64da9b30686 100644 (file)
@@ -27,6 +27,12 @@ extern "C" {
 
 int arm_cpu_caps(void);
 
+// gcc 4.6 and earlier have issues with some NEON intrinsics
+#if !defined(__clang__) && defined(__GNUC__) && \
+    __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+#define VPX_INCOMPATIBLE_GCC
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
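
A hedged sketch of how a guard like VPX_INCOMPATIBLE_GCC is consumed: NEON-intrinsics translation units compile their intrinsics code only when the toolchain is trusted, leaving C or assembly paths for older gcc. The function below is illustrative, not a real libvpx symbol:

#include <stdint.h>
#include "vpx_ports/arm.h"

#ifndef VPX_INCOMPATIBLE_GCC
#include <arm_neon.h>
/* Compiled only on toolchains whose NEON intrinsics are trusted. */
void example_add_8_neon(const uint8_t *a, const uint8_t *b, uint8_t *out) {
  vst1_u8(out, vadd_u8(vld1_u8(a), vld1_u8(b)));
}
#endif  /* !VPX_INCOMPATIBLE_GCC */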
index cf23c295e02fe99c19bfdb8b2110166ed54cce8e..f5c945c056054348336109d239fc8c0ee629b2d4 100644 (file)
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -551,8 +551,8 @@ static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src,
     int h = src->d_h;
     int x, y;
     if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
     }
     for (y = 0; y < h; y++) {
       uint16_t *p_src = (uint16_t *)(src->planes[plane] +
@@ -590,8 +590,8 @@ static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src,
     int h = src->d_h;
     int x, y;
     if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
     }
     for (y = 0; y < h; y++) {
       uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
@@ -636,8 +636,8 @@ static void high_img_downshift(vpx_image_t *dst, vpx_image_t *src,
     int h = src->d_h;
     int x, y;
     if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
     }
     for (y = 0; y < h; y++) {
       uint16_t *p_src = (uint16_t *)(src->planes[plane] +
@@ -674,8 +674,8 @@ static void low_img_downshift(vpx_image_t *dst, vpx_image_t *src,
     int h = src->d_h;
     int x, y;
     if (plane) {
-      w >>= src->x_chroma_shift;
-      h >>= src->y_chroma_shift;
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
     }
     for (y = 0; y < h; y++) {
       uint16_t *p_src = (uint16_t *)(src->planes[plane] +
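
The four hunks above make the same fix: chroma plane dimensions must round up rather than truncate when the luma dimension is odd. Because x_chroma_shift and y_chroma_shift are only ever 0 or 1 here, adding the shift itself before shifting matches the usual (1 << shift) - 1 rounding bias. A two-line illustration:

#include <stdio.h>

int main(void) {
  /* 4:2:0 subsampling (chroma shifts of 1): a 5-sample luma row maps to
     3 chroma samples, not 2. */
  const int w = 5, shift = 1;
  printf("truncated: %d\n", w >> shift);            /* 2: loses a column */
  printf("rounded:   %d\n", (w + shift) >> shift);  /* 3: correct */
  return 0;
}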
@@ -696,6 +696,14 @@ static void img_downshift(vpx_image_t *dst, vpx_image_t *src,
     low_img_downshift(dst, src, down_shift);
   }
 }
+
+static int img_shifted_realloc_required(const vpx_image_t *img,
+                                        const vpx_image_t *shifted,
+                                        vpx_img_fmt_t required_fmt) {
+  return img->d_w != shifted->d_w ||
+         img->d_h != shifted->d_h ||
+         required_fmt != shifted->fmt;
+}
 #endif
 
 int main_loop(int argc, const char **argv_) {
@@ -1130,16 +1138,17 @@ int main_loop(int argc, const char **argv_) {
       }
       // Shift up or down if necessary
       if (output_bit_depth != img->bit_depth) {
+        const vpx_img_fmt_t shifted_fmt = output_bit_depth == 8 ?
+            img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) :
+            img->fmt | VPX_IMG_FMT_HIGHBITDEPTH;
+        if (img_shifted &&
+            img_shifted_realloc_required(img, img_shifted, shifted_fmt)) {
+          vpx_img_free(img_shifted);
+          img_shifted = NULL;
+        }
         if (!img_shifted) {
-          if (output_bit_depth == 8) {
-            img_shifted = vpx_img_alloc(
-                NULL, img->fmt - VPX_IMG_FMT_HIGHBITDEPTH,
-                img->d_w, img->d_h, 16);
-          } else {
-            img_shifted = vpx_img_alloc(
-                NULL, img->fmt | VPX_IMG_FMT_HIGHBITDEPTH,
-                img->d_w, img->d_h, 16);
-          }
+          img_shifted = vpx_img_alloc(NULL, shifted_fmt,
+                                      img->d_w, img->d_h, 16);
           img_shifted->bit_depth = output_bit_depth;
         }
         if (output_bit_depth > img->bit_depth) {