granicus.if.org Git - libvpx/commitdiff
Merge "highbd_quantize_fp_32x32: normalize abs_qcoeff type"
author     James Zern <jzern@google.com>
           Tue, 27 Jun 2017 23:30:16 +0000 (23:30 +0000)
committer  Gerrit Code Review <noreply-gerritcodereview@google.com>
           Tue, 27 Jun 2017 23:30:17 +0000 (23:30 +0000)
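Note: the quantizer change named in the subject is not visible in the hunks rendered below; the subject presumably refers to keeping the intermediate abs_qcoeff in the high-bit-depth 32x32 "fp" quantizer (vp9_highbd_quantize_fp_32x32) in one consistent unsigned 32-bit type. A minimal, hypothetical C++ sketch of that shape, simplified to a single coefficient, with names and rounding details assumed from the usual vp9_quantize.c pattern rather than taken from this diff:

#include <stdint.h>

typedef int32_t tran_low_t;  // stand-in: libvpx uses a 32-bit coefficient type
                             // when high bit depth is enabled

// Quantize one coefficient the way the 32x32 "fp" path does: a 64-bit
// intermediate for the multiply, then a single unsigned 32-bit abs_qcoeff.
static tran_low_t quantize_fp_32x32_one(tran_low_t coeff, int16_t round,
                                        int16_t quant) {
  const int coeff_sign = (coeff >> 31);
  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  const int64_t tmp = abs_coeff + ((round + 1) >> 1);  // halved rounding, 32x32
  const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
  return (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
}
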
62 files changed:
test/datarate_test.cc
test/dct32x32_test.cc
test/dct_test.cc
test/partial_idct_test.cc
test/variance_test.cc
third_party/libwebm/README.libvpx
third_party/libwebm/common/hdr_util.h
third_party/libwebm/mkvmuxer/mkvmuxer.cc
third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
third_party/libwebm/mkvmuxer/mkvwriter.cc
third_party/libwebm/mkvparser/mkvparser.cc
third_party/libwebm/mkvparser/mkvreader.cc
tools/tiny_ssim.c
vp8/common/loopfilter_filters.c
vp8/common/onyxd.h
vp8/common/vp8_skin_detection.c
vp8/decoder/decodemv.c
vp8/decoder/onyxd_if.c
vp8/decoder/onyxd_int.h
vp8/decoder/threading.c
vp8/encoder/bitstream.h
vp8/encoder/encodeframe.c
vp8/encoder/encodeframe.h
vp8/encoder/ethreading.c
vp8/encoder/ethreading.h [new file with mode: 0644]
vp8/encoder/onyx_if.c
vp8/encoder/onyx_int.h
vp8/encoder/pickinter.c
vp8/encoder/picklpf.c
vp8/encoder/picklpf.h [new file with mode: 0644]
vp8/encoder/rdopt.c
vp8/encoder/rdopt.h
vp8/encoder/temporal_filter.c
vp8/encoder/temporal_filter.h [new file with mode: 0644]
vp8/encoder/x86/quantize_mmx.asm [deleted file]
vp8/encoder/x86/quantize_ssse3.c
vp8/encoder/x86/vp8_enc_stubs_mmx.c [deleted file]
vp8/vp8_dx_iface.c
vp8/vp8cx.mk
vp9/common/x86/vp9_idct_intrin_sse2.c
vp9/encoder/vp9_alt_ref_aq.c
vp9/encoder/vp9_alt_ref_aq.h
vp9/encoder/vp9_aq_cyclicrefresh.c
vp9/encoder/vp9_encodeframe.c
vp9/encoder/vp9_encodemb.c
vp9/encoder/vp9_encoder.c
vp9/encoder/vp9_encoder.h
vp9/encoder/vp9_pickmode.c
vp9/encoder/vp9_ratectrl.c
vp9/encoder/vp9_ratectrl.h
vp9/encoder/x86/temporal_filter_sse4.c
vpx_dsp/add_noise.c
vpx_dsp/arm/fdct32x32_neon.c [new file with mode: 0644]
vpx_dsp/deblock.c
vpx_dsp/vpx_dsp.mk
vpx_dsp/vpx_dsp_rtcd_defs.pl
vpx_dsp/x86/highbd_idct4x4_add_sse2.c
vpx_dsp/x86/highbd_idct4x4_add_sse4.c [new file with mode: 0644]
vpx_dsp/x86/highbd_inv_txfm_sse2.h
vpx_dsp/x86/inv_txfm_sse2.c
vpx_dsp/x86/inv_txfm_sse2.h
vpx_dsp/x86/inv_txfm_ssse3.c

test/datarate_test.cc
index a120a88d2a249f9abd675021f4f45f55fdae215b..8e93de6b9ccefdb9d040169e0c8bc58e2f7f1051 100644 (file)
@@ -1396,7 +1396,7 @@ TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) {
 
 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
 // 3 temporal layers. Run HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4threads) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
test/dct32x32_test.cc
index d8054c4eb59a5c2333c4513759dffc18c5fa71b1..cd26bfd09247553390fb586eb9104537ca7ba993 100644 (file)
@@ -383,14 +383,14 @@ INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test,
                                                      VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans32x32Test,
-    ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_neon,
-                                 0, VPX_BITS_8),
+    ::testing::Values(make_tuple(&vpx_fdct32x32_neon,
+                                 &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_c,
                                  &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
test/dct_test.cc
index 109ccb630a50a9222c53aec560fbbbfd91d3c82b..f7078f078ec80c89a6f6d8d11dd1f200046c6aed 100644 (file)
@@ -498,18 +498,16 @@ INSTANTIATE_TEST_CASE_P(
 // TODO(johannkoenig): high bit depth fdct8x8.
 INSTANTIATE_TEST_CASE_P(
     SSSE3, TransDCT,
-    ::testing::Values(make_tuple(&vpx_fdct32x32_c,
-                                 &vpx_idct32x32_1024_add_ssse3, 32, 0,
-                                 VPX_BITS_8),
+    ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2,
+                                 32, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_sse2, 8, 0,
                                  VPX_BITS_8)));
 #else
 // vpx_fdct8x8_ssse3 is only available in 64 bit builds.
 INSTANTIATE_TEST_CASE_P(
     SSSE3, TransDCT,
-    ::testing::Values(make_tuple(&vpx_fdct32x32_c,
-                                 &vpx_idct32x32_1024_add_ssse3, 32, 0,
-                                 VPX_BITS_8),
+    ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2,
+                                 32, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_sse2,
                                  8, 0, VPX_BITS_8)));
 #endif  // !ARCH_X86_64
@@ -529,7 +527,10 @@ INSTANTIATE_TEST_CASE_P(
 #if !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, TransDCT,
-    ::testing::Values(make_tuple(&vpx_fdct16x16_neon,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_neon,
+                                 &vpx_idct32x32_1024_add_neon, 32, 0,
+                                 VPX_BITS_8),
+                      make_tuple(&vpx_fdct16x16_neon,
                                  &vpx_idct16x16_256_add_neon, 16, 0,
                                  VPX_BITS_8),
                       make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 8,
test/partial_idct_test.cc
index 8b99766b50af34e80387ab0bef4299b66264d329..30df75f97a5a72d434ff15ab3f4366fc95d03500 100644 (file)
@@ -477,7 +477,9 @@ const PartialInvTxfmParam c_partial_idct_tests[] = {
 INSTANTIATE_TEST_CASE_P(C, PartialIDctTest,
                         ::testing::ValuesIn(c_partial_idct_tests));
 
-#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON
 const PartialInvTxfmParam neon_partial_idct_tests[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
   make_tuple(&vpx_highbd_fdct32x32_c,
@@ -625,9 +627,9 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = {
 
 INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest,
                         ::testing::ValuesIn(neon_partial_idct_tests));
-#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_NEON
 
-#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSE2
 // 32x32_135_ is implemented using the 1024 version.
 const PartialInvTxfmParam sse2_partial_idct_tests[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -734,12 +736,10 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
 INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest,
                         ::testing::ValuesIn(sse2_partial_idct_tests));
 
-#endif  // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSSE3
 const PartialInvTxfmParam ssse3_partial_idct_tests[] = {
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
-             &wrapper<vpx_idct32x32_1024_add_ssse3>, TX_32X32, 1024, 8, 1),
   make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>,
              &wrapper<vpx_idct32x32_135_add_ssse3>, TX_32X32, 135, 8, 1),
   make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
@@ -750,9 +750,26 @@ const PartialInvTxfmParam ssse3_partial_idct_tests[] = {
 
 INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest,
                         ::testing::ValuesIn(ssse3_partial_idct_tests));
-#endif  // HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_SSSE3
+
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+const PartialInvTxfmParam sse4_1_partial_idct_tests[] = {
+  make_tuple(
+      &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+      &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+      &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+      &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 12, 2)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, PartialIDctTest,
+                        ::testing::ValuesIn(sse4_1_partial_idct_tests));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
 const PartialInvTxfmParam dspr2_partial_idct_tests[] = {
   make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
              &wrapper<vpx_idct32x32_1024_add_dspr2>, TX_32X32, 1024, 8, 1),
@@ -780,9 +797,9 @@ const PartialInvTxfmParam dspr2_partial_idct_tests[] = {
 
 INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest,
                         ::testing::ValuesIn(dspr2_partial_idct_tests));
-#endif  // HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
 // 32x32_135_ is implemented using the 1024 version.
 const PartialInvTxfmParam msa_partial_idct_tests[] = {
   make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
@@ -811,6 +828,8 @@ const PartialInvTxfmParam msa_partial_idct_tests[] = {
 
 INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest,
                         ::testing::ValuesIn(msa_partial_idct_tests));
-#endif  // HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // !CONFIG_EMULATE_HARDWARE
 
 }  // namespace
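
The test/variance_test.cc hunks that follow convert SubpelVarianceTest and its instantiations from raw std::tr1::tuple parameters to a named TestParams<FunctionType> struct with small accessor helpers. A hypothetical sketch of such a parameter struct, with field names inferred from the accessors used in the diff (params_.log2width, params_.func, params_.bit_depth, mask(), ...), not copied from the real test header:

#include <stdint.h>
#include <stddef.h>

// Hypothetical parameter holder; derived quantities are precomputed once so
// the test body can use width(), block_size(), mask(), etc.
template <typename Func>
struct TestParams {
  TestParams(int log2w = 0, int log2h = 0, Func function = NULL,
             int bit_depth_value = 0)
      : log2width(log2w), log2height(log2h), width(1 << log2w),
        height(1 << log2h), block_size((1 << log2w) * (1 << log2h)),
        func(function) {
    use_high_bit_depth = (bit_depth_value > 0);
    bit_depth = use_high_bit_depth ? bit_depth_value : 8;  // VPX_BITS_8
    mask = (1u << bit_depth) - 1;
  }
  int log2width, log2height, width, height, block_size;
  Func func;
  int bit_depth;  // the real code may store a vpx_bit_depth_t here
  bool use_high_bit_depth;
  uint32_t mask;
};

With this shape, an instantiation such as SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0) carries the same information as the old make_tuple(6, 6, ..., 0) call, but through self-describing fields instead of positional tuple indices.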
test/variance_test.cc
index d5727a67a53a736ef8eb0f7a48479993683ba1ae..8765c48f245a5477f64fcf91fec8506d3904f599 100644 (file)
@@ -561,46 +561,26 @@ void MainTestClass<FunctionType>::MaxTestSse() {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-using ::std::tr1::get;
-using ::std::tr1::make_tuple;
-using ::std::tr1::tuple;
-
-template <typename SubpelVarianceFunctionType>
+template <typename FunctionType>
 class SubpelVarianceTest
-    : public ::testing::TestWithParam<
-          tuple<int, int, SubpelVarianceFunctionType, int> > {
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
   virtual void SetUp() {
-    const tuple<int, int, SubpelVarianceFunctionType, int> &params =
-        this->GetParam();
-    log2width_ = get<0>(params);
-    width_ = 1 << log2width_;
-    log2height_ = get<1>(params);
-    height_ = 1 << log2height_;
-    subpel_variance_ = get<2>(params);
-    if (get<3>(params)) {
-      bit_depth_ = (vpx_bit_depth_t)get<3>(params);
-      use_high_bit_depth_ = true;
-    } else {
-      bit_depth_ = VPX_BITS_8;
-      use_high_bit_depth_ = false;
-    }
-    mask_ = (1 << bit_depth_) - 1;
+    params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
-    block_size_ = width_ * height_;
-    if (!use_high_bit_depth_) {
-      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
-      sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
-      ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+    if (!use_high_bit_depth()) {
+      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size()));
+      sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size()));
+      ref_ = new uint8_t[block_size() + width() + height() + 1];
 #if CONFIG_VP9_HIGHBITDEPTH
     } else {
       src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
-          vpx_memalign(16, block_size_ * sizeof(uint16_t))));
+          vpx_memalign(16, block_size() * sizeof(uint16_t))));
       sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
-          vpx_memalign(16, block_size_ * sizeof(uint16_t))));
-      ref_ =
-          CONVERT_TO_BYTEPTR(new uint16_t[block_size_ + width_ + height_ + 1]);
+          vpx_memalign(16, block_size() * sizeof(uint16_t))));
+      ref_ = CONVERT_TO_BYTEPTR(
+          new uint16_t[block_size() + width() + height() + 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     }
     ASSERT_TRUE(src_ != NULL);
@@ -609,7 +589,7 @@ class SubpelVarianceTest
   }
 
   virtual void TearDown() {
-    if (!use_high_bit_depth_) {
+    if (!use_high_bit_depth()) {
       vpx_free(src_);
       delete[] ref_;
       vpx_free(sec_);
@@ -631,42 +611,45 @@ class SubpelVarianceTest
   uint8_t *src_;
   uint8_t *ref_;
   uint8_t *sec_;
-  bool use_high_bit_depth_;
-  vpx_bit_depth_t bit_depth_;
-  int width_, log2width_;
-  int height_, log2height_;
-  int block_size_, mask_;
-  SubpelVarianceFunctionType subpel_variance_;
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+  int byte_shift() const { return params_.bit_depth - 8; }
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  uint32_t mask() const { return params_.mask; }
 };
 
 template <typename SubpelVarianceFunctionType>
 void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
   for (int x = 0; x < 8; ++x) {
     for (int y = 0; y < 8; ++y) {
-      if (!use_high_bit_depth_) {
-        for (int j = 0; j < block_size_; j++) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
           src_[j] = rnd_.Rand8();
         }
-        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
           ref_[j] = rnd_.Rand8();
         }
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        for (int j = 0; j < block_size_; j++) {
-          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
         }
-        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(
-          var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
-      const unsigned int var2 =
-          subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
-                              use_high_bit_depth_, bit_depth_);
+          var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+      const unsigned int var2 = subpel_variance_ref(
+          ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+          use_high_bit_depth(), params_.bit_depth);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
       EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
     }
@@ -680,28 +663,28 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
   // Ref: Set the first half of values to the maximum, the second half to 0.
   for (int x = 0; x < 8; ++x) {
     for (int y = 0; y < 8; ++y) {
-      const int half = block_size_ / 2;
-      if (!use_high_bit_depth_) {
+      const int half = block_size() / 2;
+      if (!use_high_bit_depth()) {
         memset(src_, 0, half);
         memset(src_ + half, 255, half);
         memset(ref_, 255, half);
-        memset(ref_ + half, 0, half + width_ + height_ + 1);
+        memset(ref_ + half, 0, half + width() + height() + 1);
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half);
         vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
         vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
-        vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_,
-                     half + width_ + height_ + 1);
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(),
+                     half + width() + height() + 1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(
-          var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
-      const unsigned int var2 =
-          subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
-                              use_high_bit_depth_, bit_depth_);
+          var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+      const unsigned int var2 = subpel_variance_ref(
+          ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+          use_high_bit_depth(), params_.bit_depth);
       EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
       EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
     }
@@ -712,33 +695,32 @@ template <>
 void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
   for (int x = 0; x < 8; ++x) {
     for (int y = 0; y < 8; ++y) {
-      if (!use_high_bit_depth_) {
-        for (int j = 0; j < block_size_; j++) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
           src_[j] = rnd_.Rand8();
           sec_[j] = rnd_.Rand8();
         }
-        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
           ref_[j] = rnd_.Rand8();
         }
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        for (int j = 0; j < block_size_; j++) {
-          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
-          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_;
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
         }
-        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
       uint32_t sse1, sse2;
       uint32_t var1, var2;
-      ASM_REGISTER_STATE_CHECK(var1 =
-                                   subpel_variance_(ref_, width_ + 1, x, y,
-                                                    src_, width_, &sse1, sec_));
-      var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_,
-                                     x, y, &sse2, use_high_bit_depth_,
-                                     static_cast<vpx_bit_depth_t>(bit_depth_));
+      ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
+                                                   src_, width(), &sse1, sec_));
+      var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width,
+                                     params_.log2height, x, y, &sse2,
+                                     use_high_bit_depth(), params_.bit_depth);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
       EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
     }
@@ -798,37 +780,41 @@ INSTANTIATE_TEST_CASE_P(
                       VarianceParams(2, 3, &vpx_variance4x8_c),
                       VarianceParams(2, 2, &vpx_variance4x4_c)));
 
+typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
-
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
+
+typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
+    ::testing::Values(
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
 
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef MainTestClass<VarianceMxNFunc> VpxHBDMseTest;
@@ -850,18 +836,18 @@ TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
 /* TODO(debargha): This test does not support the highbd version
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDMseTest,
-    ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c),
-                      make_tuple(4, 4, &vpx_highbd_12_mse16x8_c),
-                      make_tuple(4, 4, &vpx_highbd_12_mse8x16_c),
-                      make_tuple(4, 4, &vpx_highbd_12_mse8x8_c),
-                      make_tuple(4, 4, &vpx_highbd_10_mse16x16_c),
-                      make_tuple(4, 4, &vpx_highbd_10_mse16x8_c),
-                      make_tuple(4, 4, &vpx_highbd_10_mse8x16_c),
-                      make_tuple(4, 4, &vpx_highbd_10_mse8x8_c),
-                      make_tuple(4, 4, &vpx_highbd_8_mse16x16_c),
-                      make_tuple(4, 4, &vpx_highbd_8_mse16x8_c),
-                      make_tuple(4, 4, &vpx_highbd_8_mse8x16_c),
-                      make_tuple(4, 4, &vpx_highbd_8_mse8x8_c)));
+    ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c),
+                      MseParams(4, 4, &vpx_highbd_12_mse16x8_c),
+                      MseParams(4, 4, &vpx_highbd_12_mse8x16_c),
+                      MseParams(4, 4, &vpx_highbd_12_mse8x8_c),
+                      MseParams(4, 4, &vpx_highbd_10_mse16x16_c),
+                      MseParams(4, 4, &vpx_highbd_10_mse16x8_c),
+                      MseParams(4, 4, &vpx_highbd_10_mse8x16_c),
+                      MseParams(4, 4, &vpx_highbd_10_mse8x8_c),
+                      MseParams(4, 4, &vpx_highbd_8_mse16x16_c),
+                      MseParams(4, 4, &vpx_highbd_8_mse16x8_c),
+                      MseParams(4, 4, &vpx_highbd_8_mse8x16_c),
+                      MseParams(4, 4, &vpx_highbd_8_mse8x8_c)));
 */
 
 INSTANTIATE_TEST_CASE_P(
@@ -909,88 +895,161 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
-        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
-        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
-        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
-        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
-        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
-        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
-        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
-        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
-        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
-        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
-        make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
-        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
-        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10),
-        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10),
-        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10),
-        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10),
-        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10),
-        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10),
-        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10),
-        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
-        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
-        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
-        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
-        make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
-        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
-        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12),
-        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12),
-        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12),
-        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12),
-        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12),
-        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12),
-        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12),
-        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
-        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
-        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
-        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
-        make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
-        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12)));
+        SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
+        SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
+        SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
+        SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
+        SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
+        SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
+        SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
+        SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
+        SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
+        SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
+        SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
+        SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
+        SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
+        SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c,
+                             10),
+        SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c,
+                             10),
+        SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c,
+                             10),
+        SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c,
+                             10),
+        SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c,
+                             10),
+        SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c,
+                             10),
+        SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c,
+                             10),
+        SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
+        SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
+        SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
+        SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
+        SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
+        SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
+        SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c,
+                             12),
+        SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c,
+                             12),
+        SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c,
+                             12),
+        SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c,
+                             12),
+        SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c,
+                             12),
+        SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c,
+                             12),
+        SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c,
+                             12),
+        SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
+        SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
+        SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
+        SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
+        SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
+        SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c,
+                             12)));
 
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
-        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
-        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
-        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
-        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
-        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
-        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
-        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
-        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
-        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8),
-        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8),
-        make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8),
-        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8),
-        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10),
-        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10),
-        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10),
-        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10),
-        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10),
-        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10),
-        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10),
-        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10),
-        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10),
-        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
-        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
-        make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
-        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
-        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12),
-        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12),
-        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12),
-        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12),
-        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12),
-        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12),
-        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12),
-        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12),
-        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12),
-        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
-        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
-        make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
-        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12)));
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
+        SubpelAvgVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c,
+                                8),
+        SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c,
+                                8),
+        SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c,
+                                8),
+        SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c,
+                                8),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x64_c,
+                                10),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x32_c,
+                                10),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x64_c,
+                                10),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x32_c,
+                                10),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x16_c,
+                                10),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x32_c,
+                                10),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x16_c,
+                                10),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x8_c,
+                                10),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x16_c,
+                                10),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
+        SubpelAvgVarianceParams(2, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
+        SubpelAvgVarianceParams(2, 2,
+                                &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x64_c,
+                                12),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x32_c,
+                                12),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x64_c,
+                                12),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x32_c,
+                                12),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x16_c,
+                                12),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x32_c,
+                                12),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x16_c,
+                                12),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x8_c,
+                                12),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x16_c,
+                                12),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
+        SubpelAvgVarianceParams(2, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
+        SubpelAvgVarianceParams(2, 2,
+                                &vpx_highbd_12_sub_pixel_avg_variance4x4_c,
+                                12)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_SSE2
@@ -1021,36 +1080,37 @@ INSTANTIATE_TEST_CASE_P(
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0),
-        make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0),
-        make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0),
-        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0),
-        make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0),
-        make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0),
-        make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0),
-        make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0),
-        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
-        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
-        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
-        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
-        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
 
 #if CONFIG_VP9_HIGHBITDEPTH
 /* TODO(debargha): This test does not support the highbd version
@@ -1107,112 +1167,219 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxHBDSubpelVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, 12),
-        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12),
-        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12),
-        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12),
-        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12),
-        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12),
-        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12),
-        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12),
-        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12),
-        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12),
-        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12),
-        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10),
-        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10),
-        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10),
-        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10),
-        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10),
-        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10),
-        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10),
-        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10),
-        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10),
-        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10),
-        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10),
-        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8),
-        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8),
-        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8),
-        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8),
-        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8),
-        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8),
-        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8),
-        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8),
-        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8),
-        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8),
-        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8)));
+        SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2,
+                             12),
+        SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2,
+                             12),
+        SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2,
+                             12),
+        SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2,
+                             12),
+        SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2,
+                             12),
+        SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2,
+                             12),
+        SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2,
+                             12),
+        SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2,
+                             12),
+        SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2,
+                             12),
+        SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2,
+                             12),
+        SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2,
+                             12),
+        SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2,
+                             10),
+        SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2,
+                             10),
+        SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2,
+                             10),
+        SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2,
+                             10),
+        SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2,
+                             10),
+        SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2,
+                             10),
+        SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2,
+                             10),
+        SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2,
+                             10),
+        SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2,
+                             10),
+        SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2,
+                             10),
+        SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2,
+                             10),
+        SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2,
+                             8),
+        SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2,
+                             8),
+        SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2,
+                             8),
+        SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2,
+                             8),
+        SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2,
+                             8),
+        SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2,
+                             8),
+        SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2,
+                             8),
+        SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2,
+                             8),
+        SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2,
+                             8),
+        SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8),
+        SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2,
+                             8)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxHBDSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12),
-        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12),
-        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12),
-        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12),
-        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12),
-        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12),
-        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12),
-        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12),
-        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12),
-        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12),
-        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12),
-        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10),
-        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10),
-        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10),
-        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10),
-        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10),
-        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10),
-        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10),
-        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10),
-        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10),
-        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10),
-        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10),
-        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8),
-        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8),
-        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8),
-        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8),
-        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8),
-        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8),
-        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8),
-        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8),
-        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8),
-        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8),
-        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8)));
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2,
+                                12),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2,
+                                12),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2,
+                                12),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2,
+                                12),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2,
+                                12),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2,
+                                12),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2,
+                                12),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2,
+                                12),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2,
+                                12),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2,
+                                12),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2,
+                                12),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2,
+                                10),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2,
+                                10),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2,
+                                10),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2,
+                                10),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2,
+                                10),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2,
+                                10),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2,
+                                10),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2,
+                                10),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2,
+                                10),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2,
+                                10),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2,
+                                10),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2,
+                                8),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2,
+                                8),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2,
+                                8),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2,
+                                8),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2,
+                                8),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2,
+                                8),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2,
+                                8),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2,
+                                8),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2,
+                                8),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2,
+                                8),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2,
+                                8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0)));
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0),
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VpxSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0),
-        make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0),
-        make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0),
-        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0),
-        make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0),
-        make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0),
-        make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0),
-        make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0),
-        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0),
-        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0),
-        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
-        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
-        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0)));
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3,
+                                0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3,
+                                0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3,
+                                0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3,
+                                0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3,
+                                0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3,
+                                0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3,
+                                0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3,
+                                0)));
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
@@ -1229,14 +1396,16 @@ INSTANTIATE_TEST_CASE_P(
 
 INSTANTIATE_TEST_CASE_P(
     AVX2, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0)));
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     AVX2, VpxSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0),
-        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0)));
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2,
+                                0)));
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
@@ -1265,36 +1434,37 @@ INSTANTIATE_TEST_CASE_P(
 
 INSTANTIATE_TEST_CASE_P(
     NEON, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_neon, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_neon, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_neon, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_neon, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_neon, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_neon, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_neon, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_neon, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_neon, 0)));
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_neon, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_neon, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_neon, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_neon, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_neon, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_neon, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_neon, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_neon, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0),
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON, VpxSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0),
-        make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0),
-        make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0),
-        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0),
-        make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0),
-        make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0),
-        make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0),
-        make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0),
-        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0),
-        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0),
-        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
-        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
-        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -1329,35 +1499,37 @@ INSTANTIATE_TEST_CASE_P(
 
 INSTANTIATE_TEST_CASE_P(
     MSA, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0),
-                      make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0)));
+    ::testing::Values(
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_msa, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_msa, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_msa, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_msa, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_msa, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_msa, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_msa, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_msa, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_msa, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_msa, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0),
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     MSA, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0)));
+    ::testing::Values(
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0)));
 #endif  // HAVE_MSA
 
 #if HAVE_VSX
index 1f8a13d78c114c108b272b80dcecd94e23f29644..7ef6c6298c367c8e16e0d1a201205bd92985a3ed 100644 (file)
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 9732ae991efb71aced4267d4794918279e362d99
+Version: a97c484bfd6b5de4b1b61efe33089b55d810b412
 License: BSD
 License File: LICENSE.txt
 
index 689fb30a3fcb8d3f649393bfe346c525902f310a..3ef5388fd038624136e0ebbe095c70bea0af821a 100644 (file)
@@ -47,7 +47,15 @@ struct Vp9CodecFeatures {
   int chroma_subsampling;
 };
 
+// disable deprecation warnings for auto_ptr
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
 typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic pop
+#endif
 
 bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
                              PrimaryChromaticityPtr* muxer_pc);
index 299b45c989c5ba26b3ff585f936fbb689e248255..15b9a908d8a2ed914682e8b85796b1002d151353 100644 (file)
 #include "mkvmuxer/mkvwriter.h"
 #include "mkvparser/mkvparser.h"
 
+// disable deprecation warnings for auto_ptr
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
 namespace mkvmuxer {
 
 const float PrimaryChromaticity::kChromaticityMin = 0.0f;
@@ -3053,7 +3058,7 @@ Segment::Segment()
       output_cues_(true),
       accurate_cluster_duration_(false),
       fixed_size_cluster_timecode_(false),
-      estimate_file_duration_(true),
+      estimate_file_duration_(false),
       payload_pos_(0),
       size_position_(0),
       doc_type_version_(kDefaultDocTypeVersion),
@@ -3361,7 +3366,10 @@ uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) {
   track->set_width(width);
   track->set_height(height);
 
-  tracks_.AddTrack(track, number);
+  if (!tracks_.AddTrack(track, number)) {
+    delete track;
+    return 0;
+  }
   has_video_ = true;
 
   return track->number();
@@ -3383,8 +3391,10 @@ bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) {
   cue->set_block_number(cluster->blocks_added());
   cue->set_cluster_pos(cluster->position_for_cues());
   cue->set_track(track);
-  if (!cues_.AddCue(cue))
+  if (!cues_.AddCue(cue)) {
+    delete cue;
     return false;
+  }
 
   new_cuepoint_ = false;
   return true;
@@ -3401,7 +3411,10 @@ uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels,
   track->set_sample_rate(sample_rate);
   track->set_channels(channels);
 
-  tracks_.AddTrack(track, number);
+  if (!tracks_.AddTrack(track, number)) {
+    delete track;
+    return 0;
+  }
 
   return track->number();
 }
@@ -3490,16 +3503,33 @@ bool Segment::AddGenericFrame(const Frame* frame) {
   if (frame->discard_padding() != 0)
     doc_type_version_ = 4;
 
+  if (cluster_list_size_ > 0) {
+    const uint64_t timecode_scale = segment_info_.timecode_scale();
+    const uint64_t frame_timecode = frame->timestamp() / timecode_scale;
+
+    const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1];
+    const uint64_t last_cluster_timecode = last_cluster->timecode();
+
+    const uint64_t rel_timecode = frame_timecode - last_cluster_timecode;
+    if (rel_timecode > kMaxBlockTimecode) {
+      force_new_cluster_ = true;
+    }
+  }
+
   // If the segment has a video track, hold onto audio frames to make sure
   // the audio that is associated with the start time of a video key-frame
   // is muxed into the same cluster.
   if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) &&
       !force_new_cluster_) {
     Frame* const new_frame = new (std::nothrow) Frame();
-    if (!new_frame || !new_frame->CopyFrom(*frame))
+    if (!new_frame || !new_frame->CopyFrom(*frame)) {
+      delete new_frame;
       return false;
-    if (!QueueFrame(new_frame))
+    }
+    if (!QueueFrame(new_frame)) {
+      delete new_frame;
       return false;
+    }
     track_frames_written_[frame->track_number() - 1]++;
     return true;
   }
@@ -3522,8 +3552,10 @@ bool Segment::AddGenericFrame(const Frame* frame) {
   if (!frame->CanBeSimpleBlock() && !frame->is_key() &&
       !frame->reference_block_timestamp_set()) {
     Frame* const new_frame = new (std::nothrow) Frame();
-    if (!new_frame->CopyFrom(*frame))
+    if (!new_frame || !new_frame->CopyFrom(*frame)) {
+      delete new_frame;
       return false;
+    }
     new_frame->set_reference_block_timestamp(
         last_track_timestamp_[frame->track_number() - 1]);
     frame = new_frame;
index 1ba17ac1ba03fe9a8bc77682cd2a26ed53e36750..bd98b1104db432f93f18ba15f105d476e1a38e53 100644 (file)
@@ -288,7 +288,7 @@ uint64 EbmlElementSize(uint64 type, const char* value) {
   ebml_size += strlen(value);
 
   // Size of Datasize
-  ebml_size++;
+  ebml_size += GetCodedUIntSize(strlen(value));
 
   return ebml_size;
 }
index ec34e4df8188a870732a41c72c5a17769ea55ff3..84655d802a8000d46752731210a6073b50e7dbd9 100644 (file)
@@ -8,6 +8,8 @@
 
 #include "mkvmuxer/mkvwriter.h"
 
+#include <sys/types.h>
+
 #ifdef _MSC_VER
 #include <share.h>  // for _SH_DENYWR
 #endif
index e62d6f6075c7b361730253827ba368165056220e..37f230d0a958bcf14bd340988c356e4788ebb5f3 100644 (file)
 
 #include "common/webmids.h"
 
+// disable deprecation warnings for auto_ptr
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
 namespace mkvparser {
 const float MasteringMetadata::kValueNotPresent = FLT_MAX;
 const long long Colour::kValueNotPresent = LLONG_MAX;
@@ -1528,15 +1533,19 @@ long SeekHead::Parse() {
   if (pos != stop)
     return E_FILE_FORMAT_INVALID;
 
-  m_entries = new (std::nothrow) Entry[entry_count];
+  if (entry_count > 0) {
+    m_entries = new (std::nothrow) Entry[entry_count];
 
-  if (m_entries == NULL)
-    return -1;
+    if (m_entries == NULL)
+      return -1;
+  }
 
-  m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+  if (void_element_count > 0) {
+    m_void_elements = new (std::nothrow) VoidElement[void_element_count];
 
-  if (m_void_elements == NULL)
-    return -1;
+    if (m_void_elements == NULL)
+      return -1;
+  }
 
   // now parse the entries and void elements
 
@@ -1555,14 +1564,14 @@ long SeekHead::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == libwebm::kMkvSeek) {
+    if (id == libwebm::kMkvSeek && entry_count > 0) {
       if (ParseEntry(pReader, pos, size, pEntry)) {
         Entry& e = *pEntry++;
 
         e.element_start = idpos;
         e.element_size = (pos + size) - idpos;
       }
-    } else if (id == libwebm::kMkvVoid) {
+    } else if (id == libwebm::kMkvVoid && void_element_count > 0) {
       VoidElement& e = *pVoidElement++;
 
       e.element_start = idpos;
@@ -2426,7 +2435,9 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
 }
 
 const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const {
-  assert(pTrack);
+  if (pTrack == NULL) {
+    return NULL;
+  }
 
   const long long n = pTrack->GetNumber();
 
@@ -4026,7 +4037,7 @@ long SegmentInfo::Parse() {
   }
 
   const double rollover_check = m_duration * m_timecodeScale;
-  if (rollover_check > LLONG_MAX)
+  if (rollover_check > static_cast<double>(LLONG_MAX))
     return E_FILE_FORMAT_INVALID;
 
   if (pos != stop)
@@ -4975,29 +4986,27 @@ bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos,
   if (!reader)
     return false;
 
-  std::auto_ptr<PrimaryChromaticity> chromaticity_ptr;
-
-  if (!*chromaticity) {
-    chromaticity_ptr.reset(new PrimaryChromaticity());
-  } else {
-    chromaticity_ptr.reset(*chromaticity);
-  }
+  if (!*chromaticity)
+    *chromaticity = new PrimaryChromaticity();
 
-  if (!chromaticity_ptr.get())
+  if (!*chromaticity)
     return false;
 
-  float* value = is_x ? &chromaticity_ptr->x : &chromaticity_ptr->y;
+  PrimaryChromaticity* pc = *chromaticity;
+  float* value = is_x ? &pc->x : &pc->y;
 
   double parser_value = 0;
-  const long long value_parse_status =
+  const long long parse_status =
       UnserializeFloat(reader, read_pos, value_size, parser_value);
 
-  *value = static_cast<float>(parser_value);
-
-  if (value_parse_status < 0 || *value < 0.0 || *value > 1.0)
+  // Valid range is [0, 1]. Make sure the double is representable as a float
+  // before casting.
+  if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 ||
+      (parser_value > 0.0 && parser_value < FLT_MIN))
     return false;
 
-  *chromaticity = chromaticity_ptr.release();
+  *value = static_cast<float>(parser_value);
+
   return true;
 }
 
@@ -5228,7 +5237,9 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
       double value = 0;
       const long long value_parse_status =
           UnserializeFloat(reader, read_pos, child_size, value);
-      if (value_parse_status < 0) {
+      // Make sure value is representable as a float before casting.
+      if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
         return false;
       }
 
@@ -7932,7 +7943,6 @@ long Block::Parse(const Cluster* pCluster) {
     pf = m_frames;
     while (pf != pf_end) {
       Frame& f = *pf++;
-      assert((pos + f.len) <= stop);
       if ((pos + f.len) > stop)
         return E_FILE_FORMAT_INVALID;
 
index b8fd00c2635e2c15b54cef14eb2b77cdaa4bcd1f..23d68f508919a4cf38207411100a529c983d4148 100644 (file)
@@ -7,6 +7,8 @@
 // be found in the AUTHORS file in the root of the source tree.
 #include "mkvparser/mkvreader.h"
 
+#include <sys/types.h>
+
 #include <cassert>
 
 namespace mkvparser {
index 1f6a448bcdb4a9da5831b9d427554f31c7b218d5..2a450003d5934c0c7c73e50e5f3f7e22feb7d0c7 100644 (file)
@@ -17,9 +17,9 @@
 #include "vpx/vpx_integer.h"
 #include "./y4minput.h"
 
-void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
-                          uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
-                          uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+static void ssim_parms_8x8(unsigned char *s, int sp, unsigned char *r, int rp,
+                           uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+                           uint32_t *sum_sq_r, uint32_t *sum_sxr) {
   int i, j;
   for (i = 0; i < 8; i++, s += sp, r += rp) {
     for (j = 0; j < 8; j++) {
@@ -56,16 +56,15 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
 
 static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
   uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                       &sum_sxr);
+  ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
 }
 
 // We are using an 8x8 moving window with the starting location of each 8x8
 // window on the 4x4 pixel grid. Such an arrangement allows the windows to
 // overlap block boundaries to penalize blocking artifacts.
-double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
-                 int stride_img2, int width, int height) {
+static double ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
+                    int stride_img2, int width, int height) {
   int i, j;
   int samples = 0;
   double ssim_total = 0;
@@ -103,7 +102,7 @@ static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon,
 }
 
 #define MAX_PSNR 100
-double vp9_mse2psnr(double samples, double peak, double mse) {
+static double mse2psnr(double samples, double peak, double mse) {
   double psnr;
 
   if (mse > 0.0)
@@ -129,7 +128,8 @@ typedef struct input_file {
 } input_file_t;
 
 // Open a file and determine if it's y4m or raw. If y4m, get the header.
-int open_input_file(const char *file_name, input_file_t *input, int w, int h) {
+static int open_input_file(const char *file_name, input_file_t *input, int w,
+                           int h) {
   char y4m_buf[4];
   size_t r1;
   input->type = RAW_YUV;
@@ -159,7 +159,7 @@ int open_input_file(const char *file_name, input_file_t *input, int w, int h) {
   return 0;
 }
 
-void close_input_file(input_file_t *in) {
+static void close_input_file(input_file_t *in) {
   if (in->file) fclose(in->file);
   if (in->type == Y4M) {
     vpx_img_free(&in->img);
@@ -168,8 +168,8 @@ void close_input_file(input_file_t *in) {
   }
 }
 
-size_t read_input_file(input_file_t *in, unsigned char **y, unsigned char **u,
-                       unsigned char **v) {
+static size_t read_input_file(input_file_t *in, unsigned char **y,
+                              unsigned char **u, unsigned char **v) {
   size_t r1 = 0;
   switch (in->type) {
     case Y4M:
@@ -287,7 +287,7 @@ int main(int argc, char *argv[]) {
       break;
     }
 #define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
-  ssim = vp8_ssim2(buf0, buf1, w, w, w, h);         \
+  ssim = ssim2(buf0, buf1, w, w, w, h);             \
   psnr = calc_plane_error(buf0, w, buf1, w, w, h);
 
     if (n_frames == allocated_frames) {
@@ -321,11 +321,11 @@ int main(int argc, char *argv[]) {
     ssimuavg += ssimu[i];
     ssimvavg += ssimv[i];
 
-    frame_psnr = vp9_mse2psnr(w * h * 6 / 4, 255.0,
-                              (double)psnry[i] + psnru[i] + psnrv[i]);
-    frame_psnry = vp9_mse2psnr(w * h * 4 / 4, 255.0, (double)psnry[i]);
-    frame_psnru = vp9_mse2psnr(w * h * 1 / 4, 255.0, (double)psnru[i]);
-    frame_psnrv = vp9_mse2psnr(w * h * 1 / 4, 255.0, (double)psnrv[i]);
+    frame_psnr =
+        mse2psnr(w * h * 6 / 4, 255.0, (double)psnry[i] + psnru[i] + psnrv[i]);
+    frame_psnry = mse2psnr(w * h * 4 / 4, 255.0, (double)psnry[i]);
+    frame_psnru = mse2psnr(w * h * 1 / 4, 255.0, (double)psnru[i]);
+    frame_psnrv = mse2psnr(w * h * 1 / 4, 255.0, (double)psnrv[i]);
 
     psnravg += frame_psnr;
     psnryavg += frame_psnry;
@@ -367,10 +367,10 @@ int main(int argc, char *argv[]) {
   puts("");
 
   psnrglb = psnryglb + psnruglb + psnrvglb;
-  psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb);
-  psnryglb = vp9_mse2psnr((double)n_frames * w * h * 4 / 4, 255.0, psnryglb);
-  psnruglb = vp9_mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnruglb);
-  psnrvglb = vp9_mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnrvglb);
+  psnrglb = mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb);
+  psnryglb = mse2psnr((double)n_frames * w * h * 4 / 4, 255.0, psnryglb);
+  psnruglb = mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnruglb);
+  psnrvglb = mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnrvglb);
 
   printf("GlbPSNR: %lf\n", psnrglb);
   printf("GlbPSNR-Y: %lf\n", psnryglb);
index 2a7cde8788f258f708f80dc5f4a9493f9e7ad0bc..188e290ca7ff979b29c49ac1542e38bd813600dd 100644 (file)
@@ -86,10 +86,12 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0,
   u = vp8_signed_char_clamp(ps1 + filter_value);
   *op1 = u ^ 0x80;
 }
-void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh, int count) {
+
+static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
+                                          const unsigned char *blimit,
+                                          const unsigned char *limit,
+                                          const unsigned char *thresh,
+                                          int count) {
   int hev = 0; /* high edge variance */
   signed char mask = 0;
   int i = 0;
@@ -109,10 +111,11 @@ void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
   } while (++i < count * 8);
 }
 
-void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p,
-                                     const unsigned char *blimit,
-                                     const unsigned char *limit,
-                                     const unsigned char *thresh, int count) {
+static void loop_filter_vertical_edge_c(unsigned char *s, int p,
+                                        const unsigned char *blimit,
+                                        const unsigned char *limit,
+                                        const unsigned char *thresh,
+                                        int count) {
   int hev = 0; /* high edge variance */
   signed char mask = 0;
   int i = 0;
@@ -185,11 +188,11 @@ static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0,
   *op2 = s ^ 0x80;
 }
 
-void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
-                                         const unsigned char *blimit,
-                                         const unsigned char *limit,
-                                         const unsigned char *thresh,
-                                         int count) {
+static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
+                                            const unsigned char *blimit,
+                                            const unsigned char *limit,
+                                            const unsigned char *thresh,
+                                            int count) {
   signed char hev = 0; /* high edge variance */
   signed char mask = 0;
   int i = 0;
@@ -210,10 +213,11 @@ void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
   } while (++i < count * 8);
 }
 
-void vp8_mbloop_filter_vertical_edge_c(unsigned char *s, int p,
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh, int count) {
+static void mbloop_filter_vertical_edge_c(unsigned char *s, int p,
+                                          const unsigned char *blimit,
+                                          const unsigned char *limit,
+                                          const unsigned char *thresh,
+                                          int count) {
   signed char hev = 0; /* high edge variance */
   signed char mask = 0;
   int i = 0;
@@ -295,17 +299,17 @@ void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p,
 void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
                            unsigned char *v_ptr, int y_stride, int uv_stride,
                            loop_filter_info *lfi) {
-  vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 2);
+  mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 2);
 
   if (u_ptr) {
-    vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
-                                        lfi->hev_thr, 1);
+    mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, 1);
   }
 
   if (v_ptr) {
-    vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
-                                        lfi->hev_thr, 1);
+    mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, 1);
   }
 }
 
@@ -313,17 +317,17 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
 void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
                            unsigned char *v_ptr, int y_stride, int uv_stride,
                            loop_filter_info *lfi) {
-  vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, 2);
+  mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 2);
 
   if (u_ptr) {
-    vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 1);
+    mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 1);
   }
 
   if (v_ptr) {
-    vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 1);
+    mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 1);
   }
 }
 
@@ -331,21 +335,21 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
 void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
                           unsigned char *v_ptr, int y_stride, int uv_stride,
                           loop_filter_info *lfi) {
-  vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim,
-                                    lfi->lim, lfi->hev_thr, 2);
-  vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim,
-                                    lfi->lim, lfi->hev_thr, 2);
-  vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim,
-                                    lfi->lim, lfi->hev_thr, 2);
+  loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim,
+                                lfi->lim, lfi->hev_thr, 2);
+  loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim,
+                                lfi->lim, lfi->hev_thr, 2);
+  loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim,
+                                lfi->lim, lfi->hev_thr, 2);
 
   if (u_ptr) {
-    vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim,
+                                  lfi->lim, lfi->hev_thr, 1);
   }
 
   if (v_ptr) {
-    vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim,
+                                  lfi->lim, lfi->hev_thr, 1);
   }
 }
 
@@ -363,21 +367,21 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
 void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
                           unsigned char *v_ptr, int y_stride, int uv_stride,
                           loop_filter_info *lfi) {
-  vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
-                                  lfi->hev_thr, 2);
-  vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
-                                  lfi->hev_thr, 2);
-  vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
-                                  lfi->hev_thr, 2);
+  loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
+                              lfi->hev_thr, 2);
+  loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
+                              lfi->hev_thr, 2);
+  loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
+                              lfi->hev_thr, 2);
 
   if (u_ptr) {
-    vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
-                                    lfi->hev_thr, 1);
+    loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+                                lfi->hev_thr, 1);
   }
 
   if (v_ptr) {
-    vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
-                                    lfi->hev_thr, 1);
+    loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+                                lfi->hev_thr, 1);
   }
 }
 
index cc2cb8089c23d651bd8dec490ed0efab71bc90df..d3c1b0e972c57a898a6bfb0ea0c69e359a8e2b5a 100644 (file)
@@ -22,6 +22,7 @@ extern "C" {
 #include "vpx/vp8.h"
 
 struct VP8D_COMP;
+struct VP8Common;
 
 typedef struct {
   int Width;
@@ -45,6 +46,7 @@ int vp8dx_receive_compressed_data(struct VP8D_COMP *comp, size_t size,
 int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd,
                         int64_t *time_stamp, int64_t *time_end_stamp,
                         vp8_ppflags_t *flags);
+int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame);
 
 vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp,
                                     enum vpx_ref_frame_type ref_frame_flag,
index 2c0237087ee858eef7f4d156bf36e8e748a89f16..6739efa5fe4e711461223a9f84c1dca7123b1825 100644 (file)
@@ -73,10 +73,8 @@ void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) {
   VP8_COMMON *const cm = &cpi->common;
   uint8_t *y;
   const uint8_t *src_y = cpi->Source->y_buffer;
-  const uint8_t *src_u = cpi->Source->u_buffer;
-  const uint8_t *src_v = cpi->Source->v_buffer;
   const int src_ystride = cpi->Source->y_stride;
-  const int src_uvstride = cpi->Source->uv_stride;
+  int offset = 0;
 
   YV12_BUFFER_CONFIG skinmap;
   memset(&skinmap, 0, sizeof(skinmap));
@@ -89,41 +87,21 @@ void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) {
   y = skinmap.y_buffer;
   // Loop through blocks and set skin map based on center pixel of block.
   // Set y to white for skin block, otherwise set to source with gray scale.
-  // Ignore rightmost/bottom boundary blocks.
   for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) {
     num_bl = 0;
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) {
-      int is_skin = 0;
-      int consec_zeromv = 0;
-      const int bl_index = mb_row * cm->mb_cols + mb_col;
-      const int bl_index1 = bl_index + 1;
-      const int bl_index2 = bl_index + cm->mb_cols;
-      const int bl_index3 = bl_index2 + 1;
-      consec_zeromv = VPXMIN(cpi->consec_zero_last[bl_index],
-                             VPXMIN(cpi->consec_zero_last[bl_index1],
-                                    VPXMIN(cpi->consec_zero_last[bl_index2],
-                                           cpi->consec_zero_last[bl_index3])));
-      is_skin =
-          vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride,
-                                 SKIN_8X8, consec_zeromv, 0);
+      const int is_skin = cpi->skin_map[offset++];
       for (i = 0; i < 16; i++) {
         for (j = 0; j < 16; j++) {
-          if (is_skin)
-            y[i * src_ystride + j] = 255;
-          else
-            y[i * src_ystride + j] = src_y[i * src_ystride + j];
+          y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j];
         }
       }
       num_bl++;
       y += 16;
       src_y += 16;
-      src_u += 8;
-      src_v += 8;
     }
     y += (src_ystride << 4) - (num_bl << 4);
     src_y += (src_ystride << 4) - (num_bl << 4);
-    src_u += (src_uvstride << 3) - (num_bl << 3);
-    src_v += (src_uvstride << 3) - (num_bl << 3);
   }
   vpx_write_yuv_frame(yuv_skinmap_file, &skinmap);
   vpx_free_frame_buffer(&skinmap);
index b946ab73d0ba7d1c856395e63ae96de50230101f..58a26c1c972c4ae6587d2143348abaec2f24634f 100644 (file)
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "decodemv.h"
 #include "treereader.h"
 #include "vp8/common/entropymv.h"
 #include "vp8/common/entropymode.h"
index 789c2eeffd32e68f836c527faf32b0e1b3a9c50a..f516eb0c78b353070b8646790d6fa0e5946afdc5 100644 (file)
@@ -41,7 +41,6 @@
 #endif
 
 extern void vp8_init_loop_filter(VP8_COMMON *cm);
-extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
 static int get_free_fb(VP8_COMMON *cm);
 static void ref_cnt_fb(int *buf, int *idx, int new_idx);
 
index 88b1ff16bca99b251060c97bae0e11acc94b6c22..d05368544e55b66d930394f18e15b2c5e90667c2 100644 (file)
@@ -119,6 +119,8 @@ typedef struct VP8D_COMP {
   void *decrypt_state;
 } VP8D_COMP;
 
+void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
 int vp8_decode_frame(VP8D_COMP *cpi);
 
 int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf);
index 9f77519882c9da2181df28e65dea64b0242b5f5a..f5bdae493f4326964de303433406b5645beee953 100644 (file)
@@ -20,6 +20,7 @@
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/extend.h"
 #include "vpx_ports/vpx_timer.h"
+#include "decoderthreading.h"
 #include "detokenize.h"
 #include "vp8/common/reconintra4x4.h"
 #include "vp8/common/reconinter.h"
@@ -36,8 +37,6 @@
     memset((p), 0, (n) * sizeof(*(p)));                             \
   } while (0)
 
-void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
-
 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
                                        MB_ROW_DEC *mbrd, int count) {
   VP8_COMMON *const pc = &pbi->common;
index 2b196dcd27dd6db73e6c67d850ce08a599bc09fb..ed45bff9e209da98cd8a52af67fb85dfe8d6af11 100644 (file)
 extern "C" {
 #endif
 
+#include "vp8/encoder/treewriter.h"
+#include "vp8/encoder/tokenize.h"
+
 void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount);
+void vp8_convert_rfct_to_prob(struct VP8_COMP *const cpi);
+void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra,
+                              int prob_last, int prob_garf);
+int vp8_estimate_entropy_savings(struct VP8_COMP *cpi);
+void vp8_update_coef_probs(struct VP8_COMP *cpi);
 
 #ifdef __cplusplus
 }  // extern "C"
index c7ad3bfe2c90ec2b95f335ebc85c1f1b7752e13e..b867f6cb19a95278f370275174e5741f3352c8ac 100644 (file)
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
+#include "bitstream.h"
 #include "encodemb.h"
 #include "encodemv.h"
+#if CONFIG_MULTITHREAD
+#include "ethreading.h"
+#endif
 #include "vp8/common/common.h"
 #include "onyx_int.h"
 #include "vp8/common/extend.h"
 #include "encodeframe.h"
 
 extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
-extern void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra,
-                                     int prob_last, int prob_garf);
-extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi);
-extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
-extern void vp8_auto_select_speed(VP8_COMP *cpi);
-extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
-                                      MB_ROW_COMP *mbr_ei, int count);
 static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
 
 #ifdef MODE_STATS
index c1d86349271600ebc2d50f311b87ed5849edf64f..5274aba41205bce7d97265a0d2408c29bf3dc299 100644 (file)
 #ifndef VP8_ENCODER_ENCODEFRAME_H_
 #define VP8_ENCODER_ENCODEFRAME_H_
 
+#include "vp8/encoder/tokenize.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
 
-extern void vp8_build_block_offsets(MACROBLOCK *x);
+struct VP8_COMP;
+struct macroblock;
+
+void vp8_activity_masking(struct VP8_COMP *cpi, MACROBLOCK *x);
+
+void vp8_build_block_offsets(struct macroblock *x);
 
-extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+void vp8_setup_block_ptrs(struct macroblock *x);
 
-extern void vp8_encode_frame(VP8_COMP *cpi);
+void vp8_encode_frame(struct VP8_COMP *cpi);
 
-extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
-                                         TOKENEXTRA **t, int recon_yoffset,
-                                         int recon_uvoffset, int mb_row,
-                                         int mb_col);
+int vp8cx_encode_inter_macroblock(struct VP8_COMP *cpi, struct macroblock *x,
+                                  TOKENEXTRA **t, int recon_yoffset,
+                                  int recon_uvoffset, int mb_row, int mb_col);
 
-extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
-                                         TOKENEXTRA **t);
+int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x,
+                                  TOKENEXTRA **t);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
index df34997accde9ab6a96f51af65e379bf555851c9..3e5b709e0d3f3a615fe9f47408427b8a73be2d1e 100644 (file)
@@ -14,6 +14,7 @@
 #include "vp8/common/extend.h"
 #include "bitstream.h"
 #include "encodeframe.h"
+#include "ethreading.h"
 
 #if CONFIG_MULTITHREAD
 
diff --git a/vp8/encoder/ethreading.h b/vp8/encoder/ethreading.h
new file mode 100644 (file)
index 0000000..95bf73d
--- /dev/null
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_ETHREADING_H_
+#define VP8_ENCODER_ETHREADING_H_
+
+#include "vp8/encoder/onyx_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+struct macroblock;
+
+void vp8cx_init_mbrthread_data(struct VP8_COMP *cpi, struct macroblock *x,
+                               MB_ROW_COMP *mbr_ei, int count);
+int vp8cx_create_encoder_threads(struct VP8_COMP *cpi);
+void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP8_ENCODER_ETHREADING_H_
index 66d441f0c16e7cf82d4387045d5f7ff96486046b..b018ca61eeab82fb9ff4dd5d9e284f11f49c54bb 100644 (file)
@@ -12,6 +12,7 @@
 #include "./vpx_scale_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vp8_rtcd.h"
+#include "bitstream.h"
 #include "vp8/common/onyxc_int.h"
 #include "vp8/common/blockd.h"
 #include "onyx_int.h"
 #include "mr_dissim.h"
 #endif
 #include "encodeframe.h"
+#if CONFIG_MULTITHREAD
+#include "ethreading.h"
+#endif
+#include "picklpf.h"
+#if !CONFIG_REALTIME_ONLY
+#include "temporal_filter.h"
+#endif
 
 #include <assert.h>
 #include <math.h>
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
 extern int vp8_update_coef_context(VP8_COMP *cpi);
-extern void vp8_update_coef_probs(VP8_COMP *cpi);
 #endif
 
-extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
-extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
-extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
-
 extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source,
                               YV12_BUFFER_CONFIG *post, int filt_lvl,
                               int low_var_thresh, int flag);
 extern void print_parms(VP8_CONFIG *ocf, char *filenam);
 extern unsigned int vp8_get_processor_freq();
 extern void print_tree_update_probs();
-extern int vp8cx_create_encoder_threads(VP8_COMP *cpi);
-extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi);
-
-int vp8_estimate_entropy_savings(VP8_COMP *cpi);
 
 int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
 
-extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance);
-
 static void set_default_lf_deltas(VP8_COMP *cpi);
 
 extern const int vp8_gf_interval_table[101];
@@ -620,6 +617,37 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) {
   set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
 }
 
+static void compute_skin_map(VP8_COMP *cpi) {
+  int mb_row, mb_col, num_bl;
+  VP8_COMMON *cm = &cpi->common;
+  const uint8_t *src_y = cpi->Source->y_buffer;
+  const uint8_t *src_u = cpi->Source->u_buffer;
+  const uint8_t *src_v = cpi->Source->v_buffer;
+  const int src_ystride = cpi->Source->y_stride;
+  const int src_uvstride = cpi->Source->uv_stride;
+
+  const SKIN_DETECTION_BLOCK_SIZE bsize =
+      (cm->Width * cm->Height <= 352 * 288) ? SKIN_8X8 : SKIN_16X16;
+  int offset = 0;
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    num_bl = 0;
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      const int bl_index = mb_row * cm->mb_cols + mb_col;
+      cpi->skin_map[offset] =
+          vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride,
+                                 bsize, cpi->consec_zero_last[bl_index], 0);
+      num_bl++;
+      offset++;
+      src_y += 16;
+      src_u += 8;
+      src_v += 8;
+    }
+    src_y += (src_ystride << 4) - (num_bl << 4);
+    src_u += (src_uvstride << 3) - (num_bl << 3);
+    src_v += (src_uvstride << 3) - (num_bl << 3);
+  }
+}
+
 static void set_default_lf_deltas(VP8_COMP *cpi) {
   cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
   cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
@@ -1861,6 +1889,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
     cpi->cyclic_refresh_map = (signed char *)NULL;
   }
 
+  CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols,
+                                            sizeof(cpi->skin_map[0])));
+
   CHECK_MEM_ERROR(cpi->consec_zero_last,
                   vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
   CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
@@ -1938,7 +1969,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
   yuv_denoised_file = fopen("denoised.yuv", "ab");
 #endif
 #ifdef OUTPUT_YUV_SKINMAP
-  yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+  yuv_skinmap_file = fopen("skinmap.yuv", "wb");
 #endif
 
 #if 0
@@ -2291,6 +2322,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
   dealloc_compressor_data(cpi);
   vpx_free(cpi->mb.ss);
   vpx_free(cpi->tok);
+  vpx_free(cpi->skin_map);
   vpx_free(cpi->cyclic_refresh_map);
   vpx_free(cpi->consec_zero_last);
   vpx_free(cpi->consec_zero_last_mvbias);
@@ -3770,6 +3802,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   }
 #endif
 
+  compute_skin_map(cpi);
+
   /* Set up background Q adjustment for error-resilient mode.
    * For multi-layer encodes, only enable this for the base layer.
   */
index fe775064a45730af66bac8539f81a5b2bae5f94b..53e8be84fca2634e7a0440f8b03befa4f797a448 100644 (file)
@@ -471,6 +471,8 @@ typedef struct VP8_COMP {
   int zeromv_count;
   int lf_zeromv_pct;
 
+  unsigned char *skin_map;
+
   unsigned char *segmentation_map;
   signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
   int segment_encode_breakout[MAX_MB_SEGMENTS];
index d399839ddf6b5d64a3f8f7663987af217e4cd555..a9943eb6ab9f9c197a545cba09b43f03955b9e17 100644 (file)
@@ -691,9 +691,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
   x->is_skin = 0;
   if (!cpi->oxcf.screen_content_mode) {
     int block_index = mb_row * cpi->common.mb_cols + mb_col;
-    x->is_skin = vp8_compute_skin_block(
-        x->src.y_buffer, x->src.u_buffer, x->src.v_buffer, x->src.y_stride,
-        x->src.uv_stride, SKIN_16X16, cpi->consec_zero_last[block_index], 0);
+    x->is_skin = cpi->skin_map[block_index];
   }
 #if CONFIG_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity) {
index 6f287322ec5ef34527c85a19982abc1bfad4aa3e..b1b712db9a7e16a14ca991e47034b2f9f012b6da 100644 (file)
@@ -12,6 +12,7 @@
 #include "./vpx_scale_rtcd.h"
 #include "vp8/common/onyxc_int.h"
 #include "onyx_int.h"
+#include "vp8/encoder/picklpf.h"
 #include "vp8/encoder/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/vpx_scale.h"
diff --git a/vp8/encoder/picklpf.h b/vp8/encoder/picklpf.h
new file mode 100644 (file)
index 0000000..e6ad0db
--- /dev/null
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_PICKLPF_H_
+#define VP8_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+struct yv12_buffer_config;
+
+void vp8cx_pick_filter_level_fast(struct yv12_buffer_config *sd,
+                                  struct VP8_COMP *cpi);
+void vp8cx_set_alt_lf_level(struct VP8_COMP *cpi, int filt_val);
+void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, VP8_COMP *cpi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP8_ENCODER_PICKLPF_H_
index 3792b10f89b7c0a6c0fb33e1fb76e16ace6693fa..c9513eb68f2ece161ebda5a56b7fb0f36888a59d 100644 (file)
@@ -16,6 +16,7 @@
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
+#include "encodeframe.h"
 #include "tokenize.h"
 #include "treewriter.h"
 #include "onyx_int.h"
index 8186ff1051797642e123d1cce97bcf5dcc95c976..960bd8f1cdfa319a4c32a9cbe735d89ece427a6c 100644 (file)
@@ -19,6 +19,9 @@ extern "C" {
 
 #define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D))
 
+void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
+void vp8_auto_select_speed(VP8_COMP *cpi);
+
 static INLINE void insertsortmv(int arr[], int len) {
   int i, j, k;
 
index 1b2f46bb69b4bc47237a045dfccdcc1221ad9d52..0a7d25fb0a78803d9555360872d8580a3df63103 100644 (file)
@@ -20,6 +20,7 @@
 #include "ratectrl.h"
 #include "vp8/common/quant_common.h"
 #include "segmentation.h"
+#include "temporal_filter.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/swapyv12buffer.h"
 #include "vp8/common/threading.h"
diff --git a/vp8/encoder/temporal_filter.h b/vp8/encoder/temporal_filter.h
new file mode 100644 (file)
index 0000000..865d909
--- /dev/null
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_TEMPORAL_FILTER_H_
+#define VP8_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+
+void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP8_ENCODER_TEMPORAL_FILTER_H_
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
deleted file mode 100644 (file)
index 2864ce1..0000000
+++ /dev/null
@@ -1,286 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
-sym(vp8_fast_quantize_b_impl_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;coeff_ptr
-        movq            mm0,        [rsi]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm1,        [rax]
-
-        movq            mm3,        mm0
-        psraw           mm0,        15
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0         ; abs
-
-        movq            mm2,        mm3
-        pcmpgtw         mm1,        mm2
-
-        pandn           mm1,        mm2
-        movq            mm3,        mm1
-
-        mov             rdx,        arg(6) ;quant_ptr
-        movq            mm1,        [rdx]
-
-        mov             rcx,        arg(5) ;round_ptr
-        movq            mm2,        [rcx]
-
-        paddw           mm3,        mm2
-        pmulhuw         mm3,        mm1
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0     ;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-        movq            mm0,        mm3
-
-        movq            [rdi],      mm3
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm2,        [rax]
-
-        pmullw          mm3,        mm2
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax],      mm3
-
-        ; next 8
-        movq            mm4,        [rsi+8]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+8]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+8]
-        movq            mm6,        [rcx+8]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+8],    mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+8]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+8],    mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+16]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+16]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+16]
-        movq            mm6,        [rcx+16]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+16],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+16]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+16],   mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+24]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+24]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+24]
-        movq            mm6,        [rcx+24]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+24],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+24]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+24],   mm7
-
-
-
-        mov             rdi,        arg(4) ;scan_mask
-        mov             rsi,        arg(2) ;qcoeff_ptr
-
-        pxor            mm5,        mm5
-        pxor            mm7,        mm7
-
-        movq            mm0,        [rsi]
-        movq            mm1,        [rsi+8]
-
-        movq            mm2,        [rdi]
-        movq            mm3,        [rdi+8];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        movq            mm5,        mm0
-
-        paddd           mm5,        mm1
-
-        movq            mm0,        [rsi+16]
-        movq            mm1,        [rsi+24]
-
-        movq            mm2,        [rdi+16]
-        movq            mm3,        [rdi+24];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        paddd           mm5,        mm0
-
-        paddd           mm5,        mm1
-        movq            mm0,        mm5
-
-        psrlq           mm5,        32
-        paddd           mm0,        mm5
-
-        ; eob adjustment begins here
-        movq            rcx,        mm0
-        and             rcx,        0xffff
-
-        xor             rdx,        rdx
-        sub             rdx,        rcx ; rdx=-rcx
-
-        bsr             rax,        rcx
-        inc             rax
-
-        sar             rdx,        31
-        and             rax,        rdx
-        ; Substitute the sse assembly for the old mmx mixed assembly/C. The
-        ; following is kept as reference
-        ;    movq            rcx,        mm0
-        ;    bsr             rax,        rcx
-        ;
-        ;    mov             eob,        rax
-        ;    mov             eee,        rcx
-        ;
-        ;if(eee==0)
-        ;{
-        ;    eob=-1;
-        ;}
-        ;else if(eee<0)
-        ;{
-        ;    eob=15;
-        ;}
-        ;d->eob = eob+1;
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
index 322f0a151fe2616b600f70f1f538ce96baae2458..d5474501544c5ce3c1e4281fe0d1ed124710cfec 100644 (file)
@@ -10,6 +10,7 @@
 
 #include <tmmintrin.h> /* SSSE3 */
 
+#include "./vp8_rtcd.h"
 #include "vp8/encoder/block.h"
 
 /* bitscan reverse (bsr) */
diff --git a/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/vp8/encoder/x86/vp8_enc_stubs_mmx.c
deleted file mode 100644 (file)
index 4406dd0..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vpx_ports/x86.h"
-#include "vp8/encoder/block.h"
-
-int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-                                 short *qcoeff_ptr, short *dequant_ptr,
-                                 const short *scan_mask, short *round_ptr,
-                                 short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) {
-  const short *scan_mask = vp8_default_zig_zag_mask;
-  short *coeff_ptr = b->coeff;
-  short *zbin_ptr = b->zbin;
-  short *round_ptr = b->round;
-  short *quant_ptr = b->quant_fast;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-
-  *d->eob = (char)vp8_fast_quantize_b_impl_mmx(
-      coeff_ptr, zbin_ptr, qcoeff_ptr, dequant_ptr, scan_mask,
-
-      round_ptr, quant_ptr, dqcoeff_ptr);
-}
index 9ea9c7f042e665bbc9a6b6784691b63cd13c7b87..987a5b8a4fde9274f0a473be6fc5853de1c39e5d 100644 (file)
@@ -228,7 +228,8 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
 }
 
 static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data,
-                            unsigned int data_sz, vpx_codec_err_t *res) {
+                            unsigned int data_sz,
+                            volatile vpx_codec_err_t *res) {
   *res = VPX_CODEC_OK;
 
   if (ctx->fragments.count == 0) {
@@ -267,7 +268,7 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data,
 static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
                                   const uint8_t *data, unsigned int data_sz,
                                   void *user_priv, long deadline) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
+  volatile vpx_codec_err_t res;
   unsigned int resolution_change = 0;
   unsigned int w, h;
 
@@ -580,7 +581,6 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
   }
 }
 
-extern int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame);
 static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
                                               va_list args) {
   int *ref_info = va_arg(args, int *);
index d6ed7b5674da967f9216ef3a3e867d50642f82a5..89abad2aa2e0fb26d65e647bb2e5af436a520910 100644 (file)
@@ -30,6 +30,7 @@ VP8_CX_SRCS-yes += encoder/encodeintra.c
 VP8_CX_SRCS-yes += encoder/encodemb.c
 VP8_CX_SRCS-yes += encoder/encodemv.c
 VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.h
 VP8_CX_SRCS-yes += encoder/firstpass.c
 VP8_CX_SRCS-yes += encoder/block.h
 VP8_CX_SRCS-yes += encoder/boolhuff.h
@@ -56,6 +57,7 @@ VP8_CX_SRCS-yes += encoder/modecosts.c
 VP8_CX_SRCS-yes += encoder/onyx_if.c
 VP8_CX_SRCS-yes += encoder/pickinter.c
 VP8_CX_SRCS-yes += encoder/picklpf.c
+VP8_CX_SRCS-yes += encoder/picklpf.h
 VP8_CX_SRCS-yes += encoder/vp8_quantize.c
 VP8_CX_SRCS-yes += encoder/ratectrl.c
 VP8_CX_SRCS-yes += encoder/rdopt.c
@@ -70,15 +72,16 @@ VP8_CX_SRCS-yes += encoder/treewriter.c
 VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
 VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
 VP8_CX_SRCS-yes += encoder/temporal_filter.c
+VP8_CX_SRCS-yes += encoder/temporal_filter.h
 VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
 VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
 VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
+VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h
 endif
 
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c
@@ -91,7 +94,6 @@ endif
 
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
index 7e8089b51a807059645112fb53328242ec36c065..7f0ddb0ecfe9caca0f23ec1cd8cf1c0ce3062763 100644 (file)
@@ -18,8 +18,8 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
   __m128i in[2];
   const __m128i eight = _mm_set1_epi16(8);
 
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8);
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8);
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -57,14 +57,14 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
 
   // load input data
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 1);
-  in[2] = load_input_data(input + 8 * 2);
-  in[3] = load_input_data(input + 8 * 3);
-  in[4] = load_input_data(input + 8 * 4);
-  in[5] = load_input_data(input + 8 * 5);
-  in[6] = load_input_data(input + 8 * 6);
-  in[7] = load_input_data(input + 8 * 7);
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8 * 1);
+  in[2] = load_input_data8(input + 8 * 2);
+  in[3] = load_input_data8(input + 8 * 3);
+  in[4] = load_input_data8(input + 8 * 4);
+  in[5] = load_input_data8(input + 8 * 5);
+  in[6] = load_input_data8(input + 8 * 6);
+  in[7] = load_input_data8(input + 8 * 7);
 
   switch (tx_type) {
     case 0:  // DCT_DCT
index 3aeefb58451246c1eae5f5a36680e058113c199a..acc3764c7a2d2213b93bd7e00bf9b7011a24f64a 100644 (file)
@@ -15,7 +15,7 @@ struct ALT_REF_AQ {
   int dummy;
 };
 
-struct ALT_REF_AQ *vp9_alt_ref_aq_create() {
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) {
   return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ));
 }
 
index 18acd8a85b3d71e439e6edf342354176c925dad7..e508cb44ac2d6469ec53a19db5a8bcfb2fb804a5 100644 (file)
@@ -54,7 +54,7 @@ struct ALT_REF_AQ;
  *
  * \return Instance of the class
  */
-struct ALT_REF_AQ *vp9_alt_ref_aq_create();
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void);
 
 /*!\brief Upload segmentation_map to self object
  *
index 048ea629f5aba1376a28c9ba31715b827a830009..2f2f0055a7c9e4087c9813e4b334f53422781e11 100644 (file)
@@ -425,9 +425,10 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
   int target_refresh = 0;
   double weight_segment_target = 0;
   double weight_segment = 0;
+  int thresh_low_motion = (cm->width < 720) ? 55 : 20;
   cr->apply_cyclic_refresh = 1;
   if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 ||
-      (!cpi->use_svc && rc->avg_frame_low_motion < 55 &&
+      (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion &&
        rc->frames_since_key > 40)) {
     cr->apply_cyclic_refresh = 0;
     return;
index 6215e198ca6d3842e7e70152e95ab9ad1dbb5e86..2b694a389c3e48cdc587b8675cd7e0edaeb50a83 100644 (file)
@@ -489,8 +489,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x,
   return 0;
 }
 
-int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, int width,
-                                  int height, int content_state) {
+static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+                                         int width, int height,
+                                         int content_state) {
   if (speed >= 8) {
     if (width <= 640 && height <= 480)
       return (5 * threshold_base) >> 2;
@@ -1022,6 +1023,9 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
   if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000)
     x->content_state_sb = kLowVarHighSumdiff;
 
+  if (tmp_sad > (avg_source_sad_threshold << 1))
+    x->content_state_sb = kVeryHighSad;
+
   if (cpi->content_state_sb_fd != NULL) {
     if (tmp_sad < avg_source_sad_threshold2) {
       // Cap the increment to 255.
@@ -1197,7 +1201,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
     set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
-    x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
+    if (cpi->use_skin_detection)
+      x->sb_is_skin =
+          skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
 
     d = xd->plane[0].dst.buf;
     dp = xd->plane[0].dst.stride;
index d8ea92af02775958ade5b33dcfe9b28a1d1e18d0..3a04c400ff02fce16b617dd6cab5da7671b5dc4a 100644 (file)
@@ -53,7 +53,7 @@ static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
   { 10, 6 }, { 8, 5 },
 };
 
-#define USE_GREEDY_OPTIMIZE_B 0
+#define USE_GREEDY_OPTIMIZE_B 1
 
 #if USE_GREEDY_OPTIMIZE_B
 
index bf33168e748e4420bf97f8d4b767437d927ce290..79d7d7a3118ec013a382f222eadf0d138c407961 100644 (file)
@@ -3524,12 +3524,13 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
 
   vp9_update_noise_estimate(cpi);
 
-  // Scene detection is used for VBR mode or screen-content case.
-  // Make sure compute_source_sad_onepass is set (which handles SVC case
-  // and dynamic resize).
+  // Scene detection is always used for VBR mode or screen-content case.
+  // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now
+  // (need to check encoding time cost for doing this for speed 8).
   if (cpi->compute_source_sad_onepass &&
       (cpi->oxcf.rc_mode == VPX_VBR ||
-       cpi->oxcf.content == VP9E_CONTENT_SCREEN))
+       cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+       (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8)))
     vp9_scene_detection_onepass(cpi);
 
   // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame
index 672c83bfdf9b61d734d614a253728ddb6ddb0208..7ab892000fb8e239f4a1a26ab7fe7431aad9aebf 100644 (file)
@@ -138,6 +138,7 @@ typedef enum {
   kHighSadLowSumdiff = 3,
   kHighSadHighSumdiff = 4,
   kLowVarHighSumdiff = 5,
+  kVeryHighSad = 6,
 } CONTENT_STATE_SB;
 
 typedef struct VP9EncoderConfig {
index b1579a61a18613ea9ce94ffbe62a01ece0a813b3..17dc0637f33b8681cef74272469af827af842f53 100644 (file)
@@ -1617,7 +1617,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
 
   if (cpi->oxcf.speed >= 8 && !cpi->use_svc &&
       ((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content ||
-       x->last_sb_high_content > 40))
+       x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120))
     usable_ref_frame = LAST_FRAME;
 
   for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
@@ -1693,7 +1693,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       continue;
     }
 
-    if ((cpi->sf.short_circuit_low_temp_var >= 2 ||
+    if (x->content_state_sb != kVeryHighSad &&
+        (cpi->sf.short_circuit_low_temp_var >= 2 ||
          (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) &&
         force_skip_low_temp_var && ref_frame == LAST_FRAME &&
         this_mode == NEWMV) {
index 27fea5d4e7881ba0be718218d0e6a4c21b55a664..942d7ede3bd31badf32be2628899eb92817ca45b 100644 (file)
@@ -209,7 +209,7 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
   const int bpm =
       (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
   return VPXMAX(FRAME_OVERHEAD_BITS,
-                (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+                (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS));
 }
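A hypothetical illustration of the parenthesization fix above (the value and variable names are made up; BPER_MB_NORMBITS is the shift used in the hunk): with the old form the 64-bit product is converted to int before the shift, which is implementation-defined once the product no longer fits in 32 bits, while the new form shifts the 64-bit value first and only then narrows.

  const uint64_t product = 0x180000000ULL;               /* wider than int32 */
  const int before = (int)product >> BPER_MB_NORMBITS;   /* narrows, then shifts */
  const int after = (int)(product >> BPER_MB_NORMBITS);  /* shifts, then narrows */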
 
 int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
@@ -353,6 +353,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   rc->af_ratio_onepass_vbr = 10;
   rc->prev_avg_source_sad_lag = 0;
   rc->high_source_sad = 0;
+  rc->reset_high_source_sad = 0;
   rc->high_source_sad_lagindex = -1;
   rc->alt_ref_gf_group = 0;
   rc->fac_active_worst_inter = 150;
@@ -585,7 +586,7 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
 
   // In CBR mode, this makes sure q is between oscillating Qs to prevent
   // resonance.
-  if (cpi->oxcf.rc_mode == VPX_CBR &&
+  if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad &&
       (!cpi->oxcf.gf_cbr_boost_pct ||
        !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) &&
       (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
@@ -679,7 +680,8 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
   int active_worst_quality;
   int ambient_qp;
   unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
-  if (cm->frame_type == KEY_FRAME) return rc->worst_quality;
+  if (cm->frame_type == KEY_FRAME || rc->reset_high_source_sad)
+    return rc->worst_quality;
   // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
   // for the first few frames following key frame. These are both initialized
   // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
@@ -1464,6 +1466,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   if (oxcf->pass == 0) {
     if (cm->frame_type != KEY_FRAME) compute_frame_low_motion(cpi);
   }
+  if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0;
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
@@ -2070,7 +2073,8 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
   return resize_action;
 }
 
-void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
+static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi,
+                                             uint64_t avg_sad_current) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int target;
@@ -2330,6 +2334,23 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
         }
       }
     }
+    // For CBR non-screen content mode, check if we should reset the rate
+    // control. Reset is done if high_source_sad is detected and the rate
+    // control is at very low QP with rate correction factor at min level.
+    if (cpi->oxcf.rc_mode == VPX_CBR &&
+        cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) {
+      if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality &&
+          rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) &&
+          rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) {
+        rc->rate_correction_factors[INTER_NORMAL] = 0.5;
+        rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+        rc->buffer_level = rc->optimal_buffer_level;
+        rc->bits_off_target = rc->optimal_buffer_level;
+        rc->reset_high_source_sad = 1;
+      }
+      if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad)
+        rc->this_frame_target = rc->avg_frame_bandwidth;
+    }
     // For VBR, under scene change/high content change, force golden refresh.
     if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME &&
         rc->high_source_sad && rc->frames_to_key > 3 &&
index 9e46231955f77439fd0cef8a0eb6445f3400b389..c5bc173e249d53a8c3c6d72367d472a6fc08a718 100644 (file)
@@ -169,6 +169,7 @@ typedef struct {
   int avg_frame_low_motion;
   int af_ratio_onepass_vbr;
   int force_qpmin;
+  int reset_high_source_sad;
 } RATE_CONTROL;
 
 struct VP9_COMP;
index be4cd8685c5a13901e64406245c3b291f298ddda..460dab659380a8c24bf7f74ad6b133a20bc07e7c 100644 (file)
@@ -11,6 +11,7 @@
 #include <assert.h>
 #include <smmintrin.h>
 
+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
index a2b4c9010fd81c89b81d65d60c7919eba2c4c0e0..cda6ae8814a36c80d33b0a0615f8c9c7ba49ef5e 100644 (file)
@@ -15,6 +15,7 @@
 #include "./vpx_dsp_rtcd.h"
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/postproc.h"
 #include "vpx_ports/mem.h"
 
 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c
new file mode 100644 (file)
index 0000000..5226cc4
--- /dev/null
@@ -0,0 +1,1113 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+// Most gcc 4.9 distributions outside of Android do not generate correct code
+// for this function.
+#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
+    __GNUC__ == 4 && __GNUC_MINOR__ <= 9
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+  vpx_fdct32x32_c(input, output, stride);
+}
+
+#else
+
+#define LOAD_INCREMENT(src, stride, dest, index) \
+  do {                                           \
+    dest[index] = vld1q_s16(src);                \
+    src += stride;                               \
+  } while (0)
+
+#define ADD_S16(src, index0, index1, dest, index3)      \
+  do {                                                  \
+    dest[index3] = vaddq_s16(src[index0], src[index1]); \
+  } while (0)
+
+#define ADD_SHIFT_S16(src, index0, index1)                             \
+  do {                                                                 \
+    src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
+  } while (0)
+
+// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
+// middle 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
+  const int16_t *a_end = a + 24 * stride;
+  int16x8_t c[8];
+
+  LOAD_INCREMENT(a, stride, b, 0);
+  LOAD_INCREMENT(a, stride, b, 1);
+  LOAD_INCREMENT(a, stride, b, 2);
+  LOAD_INCREMENT(a, stride, b, 3);
+  LOAD_INCREMENT(a, stride, b, 4);
+  LOAD_INCREMENT(a, stride, b, 5);
+  LOAD_INCREMENT(a, stride, b, 6);
+  LOAD_INCREMENT(a, stride, b, 7);
+
+  LOAD_INCREMENT(a_end, stride, b, 24);
+  LOAD_INCREMENT(a_end, stride, b, 25);
+  LOAD_INCREMENT(a_end, stride, b, 26);
+  LOAD_INCREMENT(a_end, stride, b, 27);
+  LOAD_INCREMENT(a_end, stride, b, 28);
+  LOAD_INCREMENT(a_end, stride, b, 29);
+  LOAD_INCREMENT(a_end, stride, b, 30);
+  LOAD_INCREMENT(a_end, stride, b, 31);
+
+  ADD_S16(b, 0, 31, c, 0);
+  ADD_S16(b, 1, 30, c, 1);
+  ADD_S16(b, 2, 29, c, 2);
+  ADD_S16(b, 3, 28, c, 3);
+  ADD_S16(b, 4, 27, c, 4);
+  ADD_S16(b, 5, 26, c, 5);
+  ADD_S16(b, 6, 25, c, 6);
+  ADD_S16(b, 7, 24, c, 7);
+
+  ADD_SHIFT_S16(b, 7, 24);
+  ADD_SHIFT_S16(b, 6, 25);
+  ADD_SHIFT_S16(b, 5, 26);
+  ADD_SHIFT_S16(b, 4, 27);
+  ADD_SHIFT_S16(b, 3, 28);
+  ADD_SHIFT_S16(b, 2, 29);
+  ADD_SHIFT_S16(b, 1, 30);
+  ADD_SHIFT_S16(b, 0, 31);
+
+  b[0] = vshlq_n_s16(c[0], 2);
+  b[1] = vshlq_n_s16(c[1], 2);
+  b[2] = vshlq_n_s16(c[2], 2);
+  b[3] = vshlq_n_s16(c[3], 2);
+  b[4] = vshlq_n_s16(c[4], 2);
+  b[5] = vshlq_n_s16(c[5], 2);
+  b[6] = vshlq_n_s16(c[6], 2);
+  b[7] = vshlq_n_s16(c[7], 2);
+
+  LOAD_INCREMENT(a, stride, b, 8);
+  LOAD_INCREMENT(a, stride, b, 9);
+  LOAD_INCREMENT(a, stride, b, 10);
+  LOAD_INCREMENT(a, stride, b, 11);
+  LOAD_INCREMENT(a, stride, b, 12);
+  LOAD_INCREMENT(a, stride, b, 13);
+  LOAD_INCREMENT(a, stride, b, 14);
+  LOAD_INCREMENT(a, stride, b, 15);
+  LOAD_INCREMENT(a, stride, b, 16);
+  LOAD_INCREMENT(a, stride, b, 17);
+  LOAD_INCREMENT(a, stride, b, 18);
+  LOAD_INCREMENT(a, stride, b, 19);
+  LOAD_INCREMENT(a, stride, b, 20);
+  LOAD_INCREMENT(a, stride, b, 21);
+  LOAD_INCREMENT(a, stride, b, 22);
+  LOAD_INCREMENT(a, stride, b, 23);
+
+  ADD_S16(b, 8, 23, c, 0);
+  ADD_S16(b, 9, 22, c, 1);
+  ADD_S16(b, 10, 21, c, 2);
+  ADD_S16(b, 11, 20, c, 3);
+  ADD_S16(b, 12, 19, c, 4);
+  ADD_S16(b, 13, 18, c, 5);
+  ADD_S16(b, 14, 17, c, 6);
+  ADD_S16(b, 15, 16, c, 7);
+
+  ADD_SHIFT_S16(b, 15, 16);
+  ADD_SHIFT_S16(b, 14, 17);
+  ADD_SHIFT_S16(b, 13, 18);
+  ADD_SHIFT_S16(b, 12, 19);
+  ADD_SHIFT_S16(b, 11, 20);
+  ADD_SHIFT_S16(b, 10, 21);
+  ADD_SHIFT_S16(b, 9, 22);
+  ADD_SHIFT_S16(b, 8, 23);
+
+  b[8] = vshlq_n_s16(c[0], 2);
+  b[9] = vshlq_n_s16(c[1], 2);
+  b[10] = vshlq_n_s16(c[2], 2);
+  b[11] = vshlq_n_s16(c[3], 2);
+  b[12] = vshlq_n_s16(c[4], 2);
+  b[13] = vshlq_n_s16(c[5], 2);
+  b[14] = vshlq_n_s16(c[6], 2);
+  b[15] = vshlq_n_s16(c[7], 2);
+}
+
+#undef LOAD_INCREMENT
+#undef ADD_S16
+#undef ADD_SHIFT_S16
+
+#define STORE_S16(src, index, dest)           \
+  do {                                        \
+    store_s16q_to_tran_low(dest, src[index]); \
+    dest += 8;                                \
+  } while (0);
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+  STORE_S16(b, 0, a);
+  STORE_S16(b, 8, a);
+  STORE_S16(b, 16, a);
+  STORE_S16(b, 24, a);
+  STORE_S16(b, 1, a);
+  STORE_S16(b, 9, a);
+  STORE_S16(b, 17, a);
+  STORE_S16(b, 25, a);
+  STORE_S16(b, 2, a);
+  STORE_S16(b, 10, a);
+  STORE_S16(b, 18, a);
+  STORE_S16(b, 26, a);
+  STORE_S16(b, 3, a);
+  STORE_S16(b, 11, a);
+  STORE_S16(b, 19, a);
+  STORE_S16(b, 27, a);
+  STORE_S16(b, 4, a);
+  STORE_S16(b, 12, a);
+  STORE_S16(b, 20, a);
+  STORE_S16(b, 28, a);
+  STORE_S16(b, 5, a);
+  STORE_S16(b, 13, a);
+  STORE_S16(b, 21, a);
+  STORE_S16(b, 29, a);
+  STORE_S16(b, 6, a);
+  STORE_S16(b, 14, a);
+  STORE_S16(b, 22, a);
+  STORE_S16(b, 30, a);
+  STORE_S16(b, 7, a);
+  STORE_S16(b, 15, a);
+  STORE_S16(b, 23, a);
+  STORE_S16(b, 31, a);
+}
+
+#undef STORE_S16
+
+// fdct_round_shift((a +/- b) * c)
+static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_high_t constant,
+                                       int16x8_t *add, int16x8_t *sub) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c0 +/- b * c1)
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_high_t constant0,
+                                       const tran_high_t constant1,
+                                       int16x8_t *add, int16x8_t *sub) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
+  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
+  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
+  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
+  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
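A per-lane scalar model of the two butterfly helpers above can be useful when cross-checking lanes against vpx_fdct32x32_c. This is only a sketch: the saturating narrow of vqrshrn_n_s32 is ignored, the *_model names are illustrative rather than part of the commit, and tran_high_t / DCT_CONST_BITS are assumed to come from the txfm_common.h include above.

static INLINE int16_t round_shift_model(tran_high_t v) {
  // fdct_round_shift(): add half of the divisor, then shift by DCT_CONST_BITS.
  return (int16_t)((v + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

static INLINE void butterfly_one_coeff_model(int16_t a, int16_t b,
                                             tran_high_t c, int16_t *add,
                                             int16_t *sub) {
  *add = round_shift_model((a + b) * c);
  *sub = round_shift_model((a - b) * c);
}

static INLINE void butterfly_two_coeff_model(int16_t a, int16_t b,
                                             tran_high_t c0, tran_high_t c1,
                                             int16_t *add, int16_t *sub) {
  // Note the pairing in the NEON code above: the sum is a * c1 + b * c0,
  // while the difference is a * c0 - b * c1.
  *add = round_shift_model(a * c1 + b * c0);
  *sub = round_shift_model(a * c0 - b * c1);
}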
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
+  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+}
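A scalar model of sub_round_shift() above, sketched for reference only (the name is illustrative and int16 wrap-around at the extreme values is ignored):

static INLINE int16_t sub_round_shift_model(int16_t a) {
  // Add 2 if non-negative, 1 if negative, then shift right by 2, matching
  // the comment above.
  return (int16_t)((a + (a < 0 ? 1 : 2)) >> 2);
}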
+
+static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // Mini cross. X the first 16 values and the middle 8 of the second half.
+  a[0] = vaddq_s16(in[0], in[15]);
+  a[1] = vaddq_s16(in[1], in[14]);
+  a[2] = vaddq_s16(in[2], in[13]);
+  a[3] = vaddq_s16(in[3], in[12]);
+  a[4] = vaddq_s16(in[4], in[11]);
+  a[5] = vaddq_s16(in[5], in[10]);
+  a[6] = vaddq_s16(in[6], in[9]);
+  a[7] = vaddq_s16(in[7], in[8]);
+
+  a[8] = vsubq_s16(in[7], in[8]);
+  a[9] = vsubq_s16(in[6], in[9]);
+  a[10] = vsubq_s16(in[5], in[10]);
+  a[11] = vsubq_s16(in[4], in[11]);
+  a[12] = vsubq_s16(in[3], in[12]);
+  a[13] = vsubq_s16(in[2], in[13]);
+  a[14] = vsubq_s16(in[1], in[14]);
+  a[15] = vsubq_s16(in[0], in[15]);
+
+  a[16] = in[16];
+  a[17] = in[17];
+  a[18] = in[18];
+  a[19] = in[19];
+
+  butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
+  butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
+  butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
+  butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
+
+  a[28] = in[28];
+  a[29] = in[29];
+  a[30] = in[30];
+  a[31] = in[31];
+
+  // Stage 3.
+  b[0] = vaddq_s16(a[0], a[7]);
+  b[1] = vaddq_s16(a[1], a[6]);
+  b[2] = vaddq_s16(a[2], a[5]);
+  b[3] = vaddq_s16(a[3], a[4]);
+
+  b[4] = vsubq_s16(a[3], a[4]);
+  b[5] = vsubq_s16(a[2], a[5]);
+  b[6] = vsubq_s16(a[1], a[6]);
+  b[7] = vsubq_s16(a[0], a[7]);
+
+  b[8] = a[8];
+  b[9] = a[9];
+
+  butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+  butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+  b[14] = a[14];
+  b[15] = a[15];
+
+  b[16] = vaddq_s16(in[16], a[23]);
+  b[17] = vaddq_s16(in[17], a[22]);
+  b[18] = vaddq_s16(in[18], a[21]);
+  b[19] = vaddq_s16(in[19], a[20]);
+
+  b[20] = vsubq_s16(in[19], a[20]);
+  b[21] = vsubq_s16(in[18], a[21]);
+  b[22] = vsubq_s16(in[17], a[22]);
+  b[23] = vsubq_s16(in[16], a[23]);
+
+  b[24] = vsubq_s16(in[31], a[24]);
+  b[25] = vsubq_s16(in[30], a[25]);
+  b[26] = vsubq_s16(in[29], a[26]);
+  b[27] = vsubq_s16(in[28], a[27]);
+
+  b[28] = vaddq_s16(in[28], a[27]);
+  b[29] = vaddq_s16(in[29], a[26]);
+  b[30] = vaddq_s16(in[30], a[25]);
+  b[31] = vaddq_s16(in[31], a[24]);
+
+  // Stage 4.
+  a[0] = vaddq_s16(b[0], b[3]);
+  a[1] = vaddq_s16(b[1], b[2]);
+  a[2] = vsubq_s16(b[1], b[2]);
+  a[3] = vsubq_s16(b[0], b[3]);
+
+  a[4] = b[4];
+
+  butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+  a[7] = b[7];
+
+  a[8] = vaddq_s16(b[8], b[11]);
+  a[9] = vaddq_s16(b[9], b[10]);
+  a[10] = vsubq_s16(b[9], b[10]);
+  a[11] = vsubq_s16(b[8], b[11]);
+  a[12] = vsubq_s16(b[15], b[12]);
+  a[13] = vsubq_s16(b[14], b[13]);
+  a[14] = vaddq_s16(b[14], b[13]);
+  a[15] = vaddq_s16(b[15], b[12]);
+
+  a[16] = b[16];
+  a[17] = b[17];
+
+  butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
+  butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
+  butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
+  butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
+
+  a[22] = b[22];
+  a[23] = b[23];
+  a[24] = b[24];
+  a[25] = b[25];
+
+  a[30] = b[30];
+  a[31] = b[31];
+
+  // Stage 5.
+  butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+  butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
+
+  b[4] = vaddq_s16(a[4], a[5]);
+  b[5] = vsubq_s16(a[4], a[5]);
+  b[6] = vsubq_s16(a[7], a[6]);
+  b[7] = vaddq_s16(a[7], a[6]);
+
+  b[8] = a[8];
+
+  butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
+  butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
+
+  b[11] = a[11];
+  b[12] = a[12];
+
+  b[15] = a[15];
+
+  b[16] = vaddq_s16(a[19], a[16]);
+  b[17] = vaddq_s16(a[18], a[17]);
+  b[18] = vsubq_s16(a[17], a[18]);
+  b[19] = vsubq_s16(a[16], a[19]);
+  b[20] = vsubq_s16(a[23], a[20]);
+  b[21] = vsubq_s16(a[22], a[21]);
+  b[22] = vaddq_s16(a[21], a[22]);
+  b[23] = vaddq_s16(a[20], a[23]);
+  b[24] = vaddq_s16(a[27], a[24]);
+  b[25] = vaddq_s16(a[26], a[25]);
+  b[26] = vsubq_s16(a[25], a[26]);
+  b[27] = vsubq_s16(a[24], a[27]);
+  b[28] = vsubq_s16(a[31], a[28]);
+  b[29] = vsubq_s16(a[30], a[29]);
+  b[30] = vaddq_s16(a[29], a[30]);
+  b[31] = vaddq_s16(a[28], a[31]);
+
+  // Stage 6.
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+
+  butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
+  butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
+
+  a[8] = vaddq_s16(b[8], b[9]);
+  a[9] = vsubq_s16(b[8], b[9]);
+  a[10] = vsubq_s16(b[11], b[10]);
+  a[11] = vaddq_s16(b[11], b[10]);
+  a[12] = vaddq_s16(b[12], b[13]);
+  a[13] = vsubq_s16(b[12], b[13]);
+  a[14] = vsubq_s16(b[15], b[14]);
+  a[15] = vaddq_s16(b[15], b[14]);
+
+  a[16] = b[16];
+  a[19] = b[19];
+  a[20] = b[20];
+  a[23] = b[23];
+  a[24] = b[24];
+  a[27] = b[27];
+  a[28] = b[28];
+  a[31] = b[31];
+
+  butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
+  butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
+
+  butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
+  butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);
+
+  // Stage 7.
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+  b[4] = a[4];
+  b[5] = a[5];
+  b[6] = a[6];
+  b[7] = a[7];
+
+  butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
+  butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
+  butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
+  butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
+
+  b[16] = vaddq_s16(a[16], a[17]);
+  b[17] = vsubq_s16(a[16], a[17]);
+  b[18] = vsubq_s16(a[19], a[18]);
+  b[19] = vaddq_s16(a[19], a[18]);
+  b[20] = vaddq_s16(a[20], a[21]);
+  b[21] = vsubq_s16(a[20], a[21]);
+  b[22] = vsubq_s16(a[23], a[22]);
+  b[23] = vaddq_s16(a[23], a[22]);
+  b[24] = vaddq_s16(a[24], a[25]);
+  b[25] = vsubq_s16(a[24], a[25]);
+  b[26] = vsubq_s16(a[27], a[26]);
+  b[27] = vaddq_s16(a[27], a[26]);
+  b[28] = vaddq_s16(a[28], a[29]);
+  b[29] = vsubq_s16(a[28], a[29]);
+  b[30] = vsubq_s16(a[31], a[30]);
+  b[31] = vaddq_s16(a[31], a[30]);
+
+  // Final stage.
+  // Also compute partial rounding shift:
+  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  out[0] = sub_round_shift(b[0]);
+  out[16] = sub_round_shift(b[1]);
+  out[8] = sub_round_shift(b[2]);
+  out[24] = sub_round_shift(b[3]);
+  out[4] = sub_round_shift(b[4]);
+  out[20] = sub_round_shift(b[5]);
+  out[12] = sub_round_shift(b[6]);
+  out[28] = sub_round_shift(b[7]);
+  out[2] = sub_round_shift(b[8]);
+  out[18] = sub_round_shift(b[9]);
+  out[10] = sub_round_shift(b[10]);
+  out[26] = sub_round_shift(b[11]);
+  out[6] = sub_round_shift(b[12]);
+  out[22] = sub_round_shift(b[13]);
+  out[14] = sub_round_shift(b[14]);
+  out[30] = sub_round_shift(b[15]);
+
+  butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
+  out[1] = sub_round_shift(a[1]);
+  out[31] = sub_round_shift(a[31]);
+
+  butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
+  out[17] = sub_round_shift(a[17]);
+  out[15] = sub_round_shift(a[15]);
+
+  butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
+  out[9] = sub_round_shift(a[9]);
+  out[23] = sub_round_shift(a[23]);
+
+  butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
+  out[25] = sub_round_shift(a[25]);
+  out[7] = sub_round_shift(a[7]);
+
+  butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
+  out[5] = sub_round_shift(a[5]);
+  out[27] = sub_round_shift(a[27]);
+
+  butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
+  out[21] = sub_round_shift(a[21]);
+  out[11] = sub_round_shift(a[11]);
+
+  butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
+  out[13] = sub_round_shift(a[13]);
+  out[19] = sub_round_shift(a[19]);
+
+  butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
+  out[29] = sub_round_shift(a[29]);
+  out[3] = sub_round_shift(a[3]);
+}
+
+#define PASS_THROUGH(src, dst, element)    \
+  do {                                     \
+    dst##_lo[element] = src##_lo[element]; \
+    dst##_hi[element] = src##_hi[element]; \
+  } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                        \
+    b##_lo[b_index] =                                                         \
+        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
+                                vget_high_s16(a[right_index]));               \
+  } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                        \
+    b##_lo[b_index] =                                                         \
+        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
+                                vget_high_s16(a[right_index]));               \
+  } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
+  do {                                                                       \
+    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
+    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+  } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+  do {                                                                     \
+    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
+    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
+    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
+    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
+  } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                    \
+    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+  } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                    \
+    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+  } while (0)
+
+// Like butterfly_one_coeff, but don't narrow results.
+static INLINE void butterfly_one_coeff_s16_s32(
+    const int16x8_t a, const int16x8_t b, const tran_high_t constant,
+    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+    int32x4_t *sub_hi) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
+                              add_index, sub_index)                      \
+  do {                                                                   \
+    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+                                &b##_lo[add_index], &b##_hi[add_index],  \
+                                &b##_lo[sub_index], &b##_hi[sub_index]); \
+  } while (0)
+
+// Like butterfly_one_coeff, but with s32.
+static INLINE void butterfly_one_coeff_s32(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_high_t constant, int32x4_t *add_lo,
+    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  // TODO(johannkoenig): Strangely there is only a conversion warning on int64_t
+  // to int32_t (const tran_high_t (aka const long long)) but not for int64_t to
+  // int16_t. The constants fit in int16_t. Investigate using int16_t for the
+  // constants to avoid bouncing between types.
+  const int32_t constant_s32 = (int32_t)constant;
+  const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant_s32);
+  const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant_s32);
+  const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant_s32);
+  const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant_s32);
+  const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant_s32);
+  const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant_s32);
+  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
+                          sub_index)                                          \
+  do {                                                                        \
+    butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index],           \
+                            a##_lo[right_index], a##_hi[right_index],         \
+                            constant, &b##_lo[add_index], &b##_hi[add_index], \
+                            &b##_lo[sub_index], &b##_hi[sub_index]);          \
+  } while (0)
+
+// Like butterfly_two_coeff, but with s32.
+static INLINE void butterfly_two_coeff_s32(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_high_t constant0,
+    const tran_high_t constant1, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32_t constant0_s32 = (int32_t)constant0;
+  const int32_t constant1_s32 = (int32_t)constant1;
+  const int32x4_t a0 = vmulq_n_s32(a_lo, constant0_s32);
+  const int32x4_t a1 = vmulq_n_s32(a_hi, constant0_s32);
+  const int32x4_t a2 = vmulq_n_s32(a_lo, constant1_s32);
+  const int32x4_t a3 = vmulq_n_s32(a_hi, constant1_s32);
+  const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0_s32);
+  const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0_s32);
+  const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1_s32);
+  const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1_s32);
+  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,           \
+                          right_constant, b, add_index, sub_index)             \
+  do {                                                                         \
+    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],            \
+                            a##_lo[right_index], a##_hi[right_index],          \
+                            left_constant, right_constant, &b##_lo[add_index], \
+                            &b##_hi[add_index], &b##_lo[sub_index],            \
+                            &b##_hi[sub_index]);                               \
+  } while (0)
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
+                                            const int32x4_t a_hi) {
+  const int32x4_t one = vdupq_n_s32(1);
+  const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+  const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+  const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+  const int16x4_t b_lo =
+      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+  const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+  const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+  const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+  const int16x4_t b_hi =
+      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+  return vcombine_s16(b_lo, b_hi);
+}
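Similarly, a scalar sketch of add_round_shift_s32() above (illustrative name; the saturating adds and the narrowing of vshrn_n_s32 are ignored):

static INLINE int16_t add_round_shift_s32_model(int32_t a) {
  // Add 1 if non-negative, 2 if negative, then shift right by 2, as the
  // comment above describes.
  return (int16_t)((a + 1 + (a < 0 ? 1 : 0)) >> 2);
}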
+
+static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+  int32x4_t c_lo[32];
+  int32x4_t c_hi[32];
+  int32x4_t d_lo[32];
+  int32x4_t d_hi[32];
+
+  // Stage 1. Done as part of the load for the first pass.
+  a[0] = vaddq_s16(in[0], in[31]);
+  a[1] = vaddq_s16(in[1], in[30]);
+  a[2] = vaddq_s16(in[2], in[29]);
+  a[3] = vaddq_s16(in[3], in[28]);
+  a[4] = vaddq_s16(in[4], in[27]);
+  a[5] = vaddq_s16(in[5], in[26]);
+  a[6] = vaddq_s16(in[6], in[25]);
+  a[7] = vaddq_s16(in[7], in[24]);
+  a[8] = vaddq_s16(in[8], in[23]);
+  a[9] = vaddq_s16(in[9], in[22]);
+  a[10] = vaddq_s16(in[10], in[21]);
+  a[11] = vaddq_s16(in[11], in[20]);
+  a[12] = vaddq_s16(in[12], in[19]);
+  a[13] = vaddq_s16(in[13], in[18]);
+  a[14] = vaddq_s16(in[14], in[17]);
+  a[15] = vaddq_s16(in[15], in[16]);
+  a[16] = vsubq_s16(in[15], in[16]);
+  a[17] = vsubq_s16(in[14], in[17]);
+  a[18] = vsubq_s16(in[13], in[18]);
+  a[19] = vsubq_s16(in[12], in[19]);
+  a[20] = vsubq_s16(in[11], in[20]);
+  a[21] = vsubq_s16(in[10], in[21]);
+  a[22] = vsubq_s16(in[9], in[22]);
+  a[23] = vsubq_s16(in[8], in[23]);
+  a[24] = vsubq_s16(in[7], in[24]);
+  a[25] = vsubq_s16(in[6], in[25]);
+  a[26] = vsubq_s16(in[5], in[26]);
+  a[27] = vsubq_s16(in[4], in[27]);
+  a[28] = vsubq_s16(in[3], in[28]);
+  a[29] = vsubq_s16(in[2], in[29]);
+  a[30] = vsubq_s16(in[1], in[30]);
+  a[31] = vsubq_s16(in[0], in[31]);
+
+  // Stage 2.
+  b[0] = vaddq_s16(a[0], a[15]);
+  b[1] = vaddq_s16(a[1], a[14]);
+  b[2] = vaddq_s16(a[2], a[13]);
+  b[3] = vaddq_s16(a[3], a[12]);
+  b[4] = vaddq_s16(a[4], a[11]);
+  b[5] = vaddq_s16(a[5], a[10]);
+  b[6] = vaddq_s16(a[6], a[9]);
+  b[7] = vaddq_s16(a[7], a[8]);
+
+  b[8] = vsubq_s16(a[7], a[8]);
+  b[9] = vsubq_s16(a[6], a[9]);
+  b[10] = vsubq_s16(a[5], a[10]);
+  b[11] = vsubq_s16(a[4], a[11]);
+  b[12] = vsubq_s16(a[3], a[12]);
+  b[13] = vsubq_s16(a[2], a[13]);
+  b[14] = vsubq_s16(a[1], a[14]);
+  b[15] = vsubq_s16(a[0], a[15]);
+
+  b[16] = a[16];
+  b[17] = a[17];
+  b[18] = a[18];
+  b[19] = a[19];
+
+  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+  b[28] = a[28];
+  b[29] = a[29];
+  b[30] = a[30];
+  b[31] = a[31];
+
+  // Stage 3. With extreme values for input this calculation rolls over int16_t.
+  // The sources for b[0] get added multiple times and, through testing, have
+  // been shown to overflow starting here.
+  ADD_S16_S32(b, 0, 7, c, 0);
+  ADD_S16_S32(b, 1, 6, c, 1);
+  ADD_S16_S32(b, 2, 5, c, 2);
+  ADD_S16_S32(b, 3, 4, c, 3);
+  SUB_S16_S32(b, 3, 4, c, 4);
+  SUB_S16_S32(b, 2, 5, c, 5);
+  SUB_S16_S32(b, 1, 6, c, 6);
+  SUB_S16_S32(b, 0, 7, c, 7);
+
+  a[8] = b[8];
+  a[9] = b[9];
+
+  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+  a[14] = b[14];
+  a[15] = b[15];
+
+  ADD_S16_S32(b, 16, 23, c, 16);
+  ADD_S16_S32(b, 17, 22, c, 17);
+  ADD_S16_S32(b, 18, 21, c, 18);
+  ADD_S16_S32(b, 19, 20, c, 19);
+  SUB_S16_S32(b, 19, 20, c, 20);
+  SUB_S16_S32(b, 18, 21, c, 21);
+  SUB_S16_S32(b, 17, 22, c, 22);
+  SUB_S16_S32(b, 16, 23, c, 23);
+  SUB_S16_S32(b, 31, 24, c, 24);
+  SUB_S16_S32(b, 30, 25, c, 25);
+  SUB_S16_S32(b, 29, 26, c, 26);
+  SUB_S16_S32(b, 28, 27, c, 27);
+  ADD_S16_S32(b, 28, 27, c, 28);
+  ADD_S16_S32(b, 29, 26, c, 29);
+  ADD_S16_S32(b, 30, 25, c, 30);
+  ADD_S16_S32(b, 31, 24, c, 31);
+
+  // Stage 4.
+  ADD_S32(c, 0, 3, d, 0);
+  ADD_S32(c, 1, 2, d, 1);
+  SUB_S32(c, 1, 2, d, 2);
+  SUB_S32(c, 0, 3, d, 3);
+
+  PASS_THROUGH(c, d, 4);
+
+  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+  PASS_THROUGH(c, d, 7);
+
+  ADDW_S16_S32(c, 11, a, 8, d, 8);
+  ADDW_S16_S32(c, 10, a, 9, d, 9);
+  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+  ADDW_S16_S32(c, 13, b, 14, d, 14);
+  ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+  PASS_THROUGH(c, d, 16);
+  PASS_THROUGH(c, d, 17);
+
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
+  BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
+  BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
+  BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
+
+  PASS_THROUGH(c, d, 22);
+  PASS_THROUGH(c, d, 23);
+  PASS_THROUGH(c, d, 24);
+  PASS_THROUGH(c, d, 25);
+
+  PASS_THROUGH(c, d, 30);
+  PASS_THROUGH(c, d, 31);
+
+  // Stage 5.
+  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+  BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
+
+  ADD_S32(d, 4, 5, c, 4);
+  SUB_S32(d, 4, 5, c, 5);
+  SUB_S32(d, 7, 6, c, 6);
+  ADD_S32(d, 7, 6, c, 7);
+
+  PASS_THROUGH(d, c, 8);
+
+  BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
+  BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
+
+  PASS_THROUGH(d, c, 11);
+  PASS_THROUGH(d, c, 12);
+  PASS_THROUGH(d, c, 15);
+
+  ADD_S32(d, 16, 19, c, 16);
+  ADD_S32(d, 17, 18, c, 17);
+  SUB_S32(d, 17, 18, c, 18);
+  SUB_S32(d, 16, 19, c, 19);
+  SUB_S32(d, 23, 20, c, 20);
+  SUB_S32(d, 22, 21, c, 21);
+  ADD_S32(d, 22, 21, c, 22);
+  ADD_S32(d, 23, 20, c, 23);
+  ADD_S32(d, 24, 27, c, 24);
+  ADD_S32(d, 25, 26, c, 25);
+  SUB_S32(d, 25, 26, c, 26);
+  SUB_S32(d, 24, 27, c, 27);
+  SUB_S32(d, 31, 28, c, 28);
+  SUB_S32(d, 30, 29, c, 29);
+  ADD_S32(d, 30, 29, c, 30);
+  ADD_S32(d, 31, 28, c, 31);
+
+  // Stage 6.
+  PASS_THROUGH(c, d, 0);
+  PASS_THROUGH(c, d, 1);
+  PASS_THROUGH(c, d, 2);
+  PASS_THROUGH(c, d, 3);
+
+  BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
+  BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
+
+  ADD_S32(c, 8, 9, d, 8);
+  SUB_S32(c, 8, 9, d, 9);
+  SUB_S32(c, 11, 10, d, 10);
+  ADD_S32(c, 11, 10, d, 11);
+  ADD_S32(c, 12, 13, d, 12);
+  SUB_S32(c, 12, 13, d, 13);
+  SUB_S32(c, 15, 14, d, 14);
+  ADD_S32(c, 15, 14, d, 15);
+
+  PASS_THROUGH(c, d, 16);
+  PASS_THROUGH(c, d, 19);
+  PASS_THROUGH(c, d, 20);
+  PASS_THROUGH(c, d, 23);
+  PASS_THROUGH(c, d, 24);
+  PASS_THROUGH(c, d, 27);
+  PASS_THROUGH(c, d, 28);
+  PASS_THROUGH(c, d, 31);
+
+  BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
+  BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
+  BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);
+
+  // Stage 7.
+  PASS_THROUGH(d, c, 0);
+  PASS_THROUGH(d, c, 1);
+  PASS_THROUGH(d, c, 2);
+  PASS_THROUGH(d, c, 3);
+  PASS_THROUGH(d, c, 4);
+  PASS_THROUGH(d, c, 5);
+  PASS_THROUGH(d, c, 6);
+  PASS_THROUGH(d, c, 7);
+
+  BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
+  BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
+  BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
+  BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
+
+  ADD_S32(d, 16, 17, c, 16);
+  SUB_S32(d, 16, 17, c, 17);
+  SUB_S32(d, 19, 18, c, 18);
+  ADD_S32(d, 19, 18, c, 19);
+  ADD_S32(d, 20, 21, c, 20);
+  SUB_S32(d, 20, 21, c, 21);
+  SUB_S32(d, 23, 22, c, 22);
+  ADD_S32(d, 23, 22, c, 23);
+  ADD_S32(d, 24, 25, c, 24);
+  SUB_S32(d, 24, 25, c, 25);
+  SUB_S32(d, 27, 26, c, 26);
+  ADD_S32(d, 27, 26, c, 27);
+  ADD_S32(d, 28, 29, c, 28);
+  SUB_S32(d, 28, 29, c, 29);
+  SUB_S32(d, 31, 30, c, 30);
+  ADD_S32(d, 31, 30, c, 31);
+
+  // Final stage.
+  // Roll rounding into this function so we can pass back int16x8.
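+  // add_round_shift_s32() applies that final rounding shift to each widened
+  // _lo/_hi pair and narrows the result back to a single int16x8.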
+
+  out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
+  out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
+
+  out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
+  out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
+  out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
+  out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
+  out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
+
+  out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
+  out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
+  out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
+  out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
+
+  out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
+  out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
+  out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
+  out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
+  out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);
+
+  BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
+  out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
+  out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
+
+  BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
+  out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
+  out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
+
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
+  out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
+  out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
+
+  BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
+  out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
+  out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
+
+  BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
+  out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
+  out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
+
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
+  out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
+  out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
+
+  BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
+  out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
+  out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
+
+  BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
+  out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
+  out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
+}
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+// Transpose 8x8 to a new location. Don't use transpose_neon.h because its
+// transposes are all in-place.
+// TODO(johannkoenig): share with other fdcts.
+static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
+  // Swap 16 bit elements.
+  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+  // Swap 32 bit elements.
+  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+                                   vreinterpretq_s32_s16(c1.val[0]));
+  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+                                   vreinterpretq_s32_s16(c1.val[1]));
+  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+                                   vreinterpretq_s32_s16(c3.val[0]));
+  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+                                   vreinterpretq_s32_s16(c3.val[1]));
+
+  // Swap 64 bit elements
+  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+  b[0] = e0.val[0];
+  b[1] = e1.val[0];
+  b[2] = e2.val[0];
+  b[3] = e3.val[0];
+  b[4] = e0.val[1];
+  b[5] = e1.val[1];
+  b[6] = e2.val[1];
+  b[7] = e3.val[1];
+}
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+  int16x8_t temp0[32];
+  int16x8_t temp1[32];
+  int16x8_t temp2[32];
+  int16x8_t temp3[32];
+  int16x8_t temp4[32];
+  int16x8_t temp5[32];
+
+  // Process in 8x32 columns.
+  load(input, stride, temp0);
+  dct_body_first_pass(temp0, temp1);
+
+  load(input + 8, stride, temp0);
+  dct_body_first_pass(temp0, temp2);
+
+  load(input + 16, stride, temp0);
+  dct_body_first_pass(temp0, temp3);
+
+  load(input + 24, stride, temp0);
+  dct_body_first_pass(temp0, temp4);
+
+  // Generate the top row of 8x32 for the second pass by transposing the
+  // first 8 output rows from each of the four column passes.
+  transpose_8x8(&temp1[0], &temp0[0]);
+  transpose_8x8(&temp2[0], &temp0[8]);
+  transpose_8x8(&temp3[0], &temp0[16]);
+  transpose_8x8(&temp4[0], &temp0[24]);
+
+  dct_body_second_pass(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output, temp5);
+
+  // Second row of 8x32.
+  transpose_8x8(&temp1[8], &temp0[0]);
+  transpose_8x8(&temp2[8], &temp0[8]);
+  transpose_8x8(&temp3[8], &temp0[16]);
+  transpose_8x8(&temp4[8], &temp0[24]);
+
+  dct_body_second_pass(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output + 8 * 32, temp5);
+
+  // Third row of 8x32.
+  transpose_8x8(&temp1[16], &temp0[0]);
+  transpose_8x8(&temp2[16], &temp0[8]);
+  transpose_8x8(&temp3[16], &temp0[16]);
+  transpose_8x8(&temp4[16], &temp0[24]);
+
+  dct_body_second_pass(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output + 16 * 32, temp5);
+
+  // Final row of 8x32.
+  transpose_8x8(&temp1[24], &temp0[0]);
+  transpose_8x8(&temp2[24], &temp0[8]);
+  transpose_8x8(&temp3[24], &temp0[16]);
+  transpose_8x8(&temp4[24], &temp0[24]);
+
+  dct_body_second_pass(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output + 24 * 32, temp5);
+}
+#endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
+        // __GNUC__ == 4 && __GNUC_MINOR__ <= 9
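A minimal usage sketch (not part of the change; the caller and buffer names are
hypothetical): encoders reach the new transform through the run-time-dispatched
name whose prototype is updated in vpx_dsp_rtcd_defs.pl below, rather than
calling vpx_fdct32x32_neon directly.

    #include "./vpx_dsp_rtcd.h"  // declares the dispatched vpx_fdct32x32()

    static void forward_transform_32x32(const int16_t *residual /* 32x32 */,
                                        tran_low_t *coeff /* 1024 outputs */) {
      // Once vpx_dsp_rtcd() has run at init, this resolves to
      // vpx_fdct32x32_neon on targets where the NEON build is enabled.
      vpx_fdct32x32(residual, coeff, 32);
    }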
index a0db1e40c98e87ecdfd3fb8d6d33485051bca995..94acbb391958841b20ff352ab1d75109b34c9f1f 100644 (file)
@@ -9,6 +9,7 @@
  */
 #include <assert.h>
 #include <stdlib.h>
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
 const int16_t vpx_rv[] = {
index 4b9173c741b1d537e2480b342bc8e10139e1ea41..8c5eb1042ec53ee1f637c9834c40439ec4a8a632 100644 (file)
@@ -195,6 +195,7 @@ DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
 DSP_SRCS-$(HAVE_NEON)   += arm/fdct_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/fdct16x16_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/fdct32x32_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
@@ -238,6 +239,7 @@ DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct4x4_add_sse2.c
 DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct8x8_add_sse2.c
 DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct16x16_add_sse2.c
 DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct32x32_add_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c
 endif  # !CONFIG_VP9_HIGHBITDEPTH
 
 ifeq ($(HAVE_NEON_ASM),yes)
index 371c2455fd7553d0b69212f926b30e9dc551d52a..51aa9f637ffe7d2547e90b82f3eaaf5ad52beebe 100644 (file)
@@ -502,7 +502,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_fdct16x16_1 sse2/;
 
   add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32 sse2/;
+  specialize qw/vpx_fdct32x32 neon sse2/;
 
   add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct32x32_rd sse2/;
@@ -550,7 +550,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_fdct16x16_1 sse2 msa/;
 
   add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32 sse2 avx2 msa/;
+  specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/;
 
   add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct32x32_rd sse2 avx2 msa/;
@@ -592,7 +592,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
   $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2;
   specialize qw/vpx_idct16x16_10_add neon sse2/;
   specialize qw/vpx_idct16x16_1_add neon sse2/;
-  specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/;
+  specialize qw/vpx_idct32x32_1024_add neon sse2/;
   specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/;
   $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
   specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
@@ -652,7 +652,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
 
   if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
-    specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;
+    specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/;
     specialize qw/vpx_highbd_idct8x8_64_add neon sse2/;
     specialize qw/vpx_highbd_idct8x8_12_add neon sse2/;
     specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
index 5293f5694f65a6c1f114e68cf1177b2fa5b3056b..ac6f73d831b81ae659e764db791477ba866a7f46 100644 (file)
@@ -12,7 +12,6 @@
 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/transpose_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
 
 static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0,
                                                    const __m128i in1) {
@@ -22,16 +21,6 @@ static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0,
   return dct_const_round_shift_sse2(t2);
 }
 
-static INLINE __m128i wraplow_16bit_sse2(const __m128i in0, const __m128i in1,
-                                         const __m128i rounding) {
-  __m128i temp[2];
-  temp[0] = _mm_add_epi32(in0, rounding);
-  temp[1] = _mm_add_epi32(in1, rounding);
-  temp[0] = _mm_srai_epi32(temp[0], 4);
-  temp[1] = _mm_srai_epi32(temp[1], 4);
-  return _mm_packs_epi32(temp[0], temp[1]);
-}
-
 static INLINE void highbd_idct4_small_sse2(__m128i *const io) {
   const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0);
   const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0);
@@ -100,19 +89,6 @@ static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
   return _mm_sub_epi64(out, sign);
 }
 
-static INLINE __m128i dct_const_round_shift_64bit_sse2(const __m128i in) {
-  const __m128i t = _mm_add_epi64(
-      in,
-      _mm_setr_epi32(DCT_CONST_ROUNDING << 2, 0, DCT_CONST_ROUNDING << 2, 0));
-  return _mm_srli_si128(t, 2);
-}
-
-static INLINE __m128i pack_4_sse2(const __m128i in0, const __m128i in1) {
-  const __m128i t0 = _mm_unpacklo_epi32(in0, in1);  // 0, 2
-  const __m128i t1 = _mm_unpackhi_epi32(in0, in1);  // 1, 3
-  return _mm_unpacklo_epi32(t0, t1);                // 0, 1, 2, 3
-}
-
 static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
   const __m128i cospi_p16_p16 =
       _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
@@ -133,12 +109,12 @@ static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
   temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p16_p16);
   temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p16_p16);
   temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p16_p16);
-  temp1[0] = dct_const_round_shift_64bit_sse2(temp1[0]);
-  temp1[1] = dct_const_round_shift_64bit_sse2(temp1[1]);
-  temp2[0] = dct_const_round_shift_64bit_sse2(temp2[0]);
-  temp2[1] = dct_const_round_shift_64bit_sse2(temp2[1]);
-  step[0] = pack_4_sse2(temp1[0], temp1[1]);
-  step[1] = pack_4_sse2(temp2[0], temp2[1]);
+  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+  step[0] = pack_4(temp1[0], temp1[1]);
+  step[1] = pack_4(temp2[0], temp2[1]);
 
   abs_extend_64bit_sse2(io[1], temp1, sign1);
   abs_extend_64bit_sse2(io[3], temp2, sign2);
@@ -154,12 +130,12 @@ static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
   temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);  // [1]*cospi_24 - [3]*cospi_8
   temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);  // [1]*cospi_8 + [3]*cospi_24
   temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);  // [1]*cospi_8 + [3]*cospi_24
-  temp1[0] = dct_const_round_shift_64bit_sse2(temp1[0]);
-  temp1[1] = dct_const_round_shift_64bit_sse2(temp1[1]);
-  temp2[0] = dct_const_round_shift_64bit_sse2(temp2[0]);
-  temp2[1] = dct_const_round_shift_64bit_sse2(temp2[1]);
-  step[2] = pack_4_sse2(temp1[0], temp1[1]);
-  step[3] = pack_4_sse2(temp2[0], temp2[1]);
+  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+  step[2] = pack_4(temp1[0], temp1[1]);
+  step[3] = pack_4(temp2[0], temp2[1]);
 
   // stage 2
   io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
@@ -211,31 +187,11 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
       highbd_idct4_large_sse2(io);
       highbd_idct4_large_sse2(io);
     }
-    io[0] = wraplow_16bit_sse2(io[0], io[1], _mm_set1_epi32(8));
-    io[1] = wraplow_16bit_sse2(io[2], io[3], _mm_set1_epi32(8));
+    io[0] = wraplow_16bit(io[0], io[1], _mm_set1_epi32(8));
+    io[1] = wraplow_16bit(io[2], io[3], _mm_set1_epi32(8));
   }
 
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
-    __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi64(d0,
-                            _mm_loadl_epi64((const __m128i *)(dest + stride)));
-    d2 = _mm_unpacklo_epi64(
-        d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
-    d0 = clamp_high_sse2(_mm_adds_epi16(d0, io[0]), bd);
-    d2 = clamp_high_sse2(_mm_adds_epi16(d2, io[1]), bd);
-    // store input0
-    _mm_storel_epi64((__m128i *)dest, d0);
-    // store input1
-    d0 = _mm_srli_si128(d0, 8);
-    _mm_storel_epi64((__m128i *)(dest + stride), d0);
-    // store input2
-    _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
-    // store input3
-    d2 = _mm_srli_si128(d2, 8);
-    _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
-  }
+  recon_and_store_4(dest, io, stride, bd);
 }
 
 void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
new file mode 100644 (file)
index 0000000..9d1c6f4
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
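+// extend_64bit() duplicates each 32-bit coefficient within a 64-bit lane so
+// that _mm_mul_epi32, which multiplies the low 32-bit element of each 64-bit
+// lane, can form full 64-bit products for all four values across out[0] and
+// out[1].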
+static INLINE void extend_64bit(const __m128i in,
+                                __m128i *const out /*out[2]*/) {
+  out[0] = _mm_unpacklo_epi32(in, in);  // 0, 0, 1, 1
+  out[1] = _mm_unpackhi_epi32(in, in);  // 2, 2, 3, 3
+}
+
+static INLINE void highbd_idct4(__m128i *const io) {
+  const __m128i cospi_p16_p16 =
+      _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
+  const __m128i cospi_p08_p08 =
+      _mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0);
+  const __m128i cospi_p24_p24 =
+      _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
+  __m128i temp1[4], temp2[4], step[4];
+
+  transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);
+
+  // stage 1
+  temp1[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
+  temp2[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
+  extend_64bit(temp1[0], temp1);
+  extend_64bit(temp2[0], temp2);
+  temp1[0] = _mm_mul_epi32(temp1[0], cospi_p16_p16);
+  temp1[1] = _mm_mul_epi32(temp1[1], cospi_p16_p16);
+  temp2[0] = _mm_mul_epi32(temp2[0], cospi_p16_p16);
+  temp2[1] = _mm_mul_epi32(temp2[1], cospi_p16_p16);
+  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+  step[0] = pack_4(temp1[0], temp1[1]);
+  step[1] = pack_4(temp2[0], temp2[1]);
+
+  extend_64bit(io[1], temp1);
+  extend_64bit(io[3], temp2);
+  temp1[2] = _mm_mul_epi32(temp1[0], cospi_p08_p08);
+  temp1[3] = _mm_mul_epi32(temp1[1], cospi_p08_p08);
+  temp1[0] = _mm_mul_epi32(temp1[0], cospi_p24_p24);
+  temp1[1] = _mm_mul_epi32(temp1[1], cospi_p24_p24);
+  temp2[2] = _mm_mul_epi32(temp2[0], cospi_p24_p24);
+  temp2[3] = _mm_mul_epi32(temp2[1], cospi_p24_p24);
+  temp2[0] = _mm_mul_epi32(temp2[0], cospi_p08_p08);
+  temp2[1] = _mm_mul_epi32(temp2[1], cospi_p08_p08);
+  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);  // [1]*cospi_24 - [3]*cospi_8
+  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);  // [1]*cospi_24 - [3]*cospi_8
+  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);  // [1]*cospi_8 + [3]*cospi_24
+  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);  // [1]*cospi_8 + [3]*cospi_24
+  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+  step[2] = pack_4(temp1[0], temp1[1]);
+  step[3] = pack_4(temp2[0], temp2[1]);
+
+  // stage 2
+  io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
+  io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
+  io[2] = _mm_sub_epi32(step[1], step[2]);  // step[1] - step[2]
+  io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
+}
+
+void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  __m128i io[4];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0));
+  io[1] = _mm_load_si128((const __m128i *)(input + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 8));
+  io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
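+  // For 8-bit content the coefficients fit in int16_t, so pack down and reuse
+  // the 16-bit SSE2 idct4 path; otherwise run the 32-bit high bitdepth path.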
+  if (bd == 8) {
+    __m128i io_short[2];
+
+    io_short[0] = _mm_packs_epi32(io[0], io[1]);
+    io_short[1] = _mm_packs_epi32(io[2], io[3]);
+    idct4_sse2(io_short);
+    idct4_sse2(io_short);
+    io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+    io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+    io[0] = _mm_srai_epi16(io_short[0], 4);
+    io[1] = _mm_srai_epi16(io_short[1], 4);
+  } else {
+    highbd_idct4(io);
+    highbd_idct4(io);
+    io[0] = wraplow_16bit(io[0], io[1], _mm_set1_epi32(8));
+    io[1] = wraplow_16bit(io[2], io[3], _mm_set1_epi32(8));
+  }
+
+  recon_and_store_4(dest, io, stride, bd);
+}
index ea100c6e1975306e5d9d418b9781ae408f19a25f..be740a8c7fe690890778cb7ad1f8e8d298f18cf5 100644 (file)
 #include "vpx_dsp/inv_txfm.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
+static INLINE __m128i wraplow_16bit(const __m128i in0, const __m128i in1,
+                                    const __m128i rounding) {
+  __m128i temp[2];
+  temp[0] = _mm_add_epi32(in0, rounding);
+  temp[1] = _mm_add_epi32(in1, rounding);
+  temp[0] = _mm_srai_epi32(temp[0], 4);
+  temp[1] = _mm_srai_epi32(temp[1], 4);
+  return _mm_packs_epi32(temp[0], temp[1]);
+}
+
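+// The 64-bit multiply path pre-scales its cospi constants by 4 (<< 2), so the
+// rounding shift here is DCT_CONST_BITS + 2 = 16 bits and can be done with a
+// 2-byte _mm_srli_si128 of the whole register. pack_4() below then gathers
+// the rounded values, left in 32-bit elements 0 and 2, back into order.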
+static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
+  const __m128i t = _mm_add_epi64(
+      in,
+      _mm_setr_epi32(DCT_CONST_ROUNDING << 2, 0, DCT_CONST_ROUNDING << 2, 0));
+  return _mm_srli_si128(t, 2);
+}
+
+static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) {
+  const __m128i t0 = _mm_unpacklo_epi32(in0, in1);  // 0, 2
+  const __m128i t1 = _mm_unpackhi_epi32(in0, in1);  // 1, 3
+  return _mm_unpacklo_epi32(t0, t1);                // 0, 1, 2, 3
+}
+
 static INLINE __m128i add_dc_clamp(const __m128i *const min,
                                    const __m128i *const max,
                                    const __m128i *const dc,
@@ -67,4 +90,23 @@ static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
   return retval;
 }
 
+static INLINE void recon_and_store_4(uint16_t *const dest,
+                                     const __m128i *const io, const int stride,
+                                     int bd) {
+  __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+  __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+  d0 =
+      _mm_unpacklo_epi64(d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+  d2 = _mm_unpacklo_epi64(
+      d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+  d0 = clamp_high_sse2(_mm_adds_epi16(d0, io[0]), bd);
+  d2 = clamp_high_sse2(_mm_adds_epi16(d2, io[1]), bd);
+  _mm_storel_epi64((__m128i *)dest, d0);
+  d0 = _mm_srli_si128(d0, 8);
+  _mm_storel_epi64((__m128i *)(dest + stride), d0);
+  _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+  d2 = _mm_srli_si128(d2, 8);
+  _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+}
+
 #endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
index 9e77bdbdb1f61bb2b46fa64b5ef8d86662e5a992..5d12d1ff146802218bd0ccc91002caec2fabc132 100644 (file)
@@ -27,8 +27,8 @@ void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
   __m128i in[2];
 
   // Rows
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8);
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8);
   idct4_sse2(in);
 
   // Columns
@@ -163,58 +163,53 @@ static INLINE void multiplication_and_add(
   *res3 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst3);
 }
 
-static void multiplication_and_add_2(const __m128i *const in0,
-                                     const __m128i *const in1,
-                                     const __m128i *const cst0,
-                                     const __m128i *const cst1,
-                                     __m128i *const res0, __m128i *const res1) {
-  const __m128i lo = _mm_unpacklo_epi16(*in0, *in1);
-  const __m128i hi = _mm_unpackhi_epi16(*in0, *in1);
-  *res0 = idct_calc_wraplow_sse2(lo, hi, *cst0);
-  *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1);
-}
-
 static INLINE void idct8(const __m128i *const in, __m128i *const out) {
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  /* Stage1 */
-  multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &stg1_0, &stg1_1,
-                         &stg1_2, &stg1_3, &stp1_4, &stp1_7, &stp1_5, &stp1_6);
-
-  /* Stage2 */
-  multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &stg2_0, &stg2_1,
-                         &stg2_2, &stg2_3, &stp2_0, &stp2_1, &stp2_2, &stp2_3);
-
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
-  /* Stage3 */
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-  multiplication_and_add_2(&stp2_6, &stp2_5, &stg2_1, &stg2_0, &stp1_5,
-                           &stp1_6);
-
-  /* Stage4  */
-  out[0] = _mm_add_epi16(stp1_0, stp2_7);
-  out[1] = _mm_add_epi16(stp1_1, stp1_6);
-  out[2] = _mm_add_epi16(stp1_2, stp1_5);
-  out[3] = _mm_add_epi16(stp1_3, stp2_4);
-  out[4] = _mm_sub_epi16(stp1_3, stp2_4);
-  out[5] = _mm_sub_epi16(stp1_2, stp1_5);
-  out[6] = _mm_sub_epi16(stp1_1, stp1_6);
-  out[7] = _mm_sub_epi16(stp1_0, stp2_7);
+  const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  __m128i step1[8], step2[8];
+
+  // stage 1
+  {
+    const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+    const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+    const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+    const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+    multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &cp_28_n4, &cp_4_28,
+                           &cp_n20_12, &cp_12_20, &step1[4], &step1[7],
+                           &step1[5], &step1[6]);
+  }
+
+  // stage 2
+  {
+    const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+    const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+    multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &cp_16_16,
+                           &cp_16_n16, &cp_24_n8, &cp_8_24, &step2[0],
+                           &step2[1], &step2[2], &step2[3]);
+  }
+
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
+                           &step1[5], &step1[6]);
+
+  // stage 4
+  out[0] = _mm_add_epi16(step1[0], step2[7]);
+  out[1] = _mm_add_epi16(step1[1], step1[6]);
+  out[2] = _mm_add_epi16(step1[2], step1[5]);
+  out[3] = _mm_add_epi16(step1[3], step2[4]);
+  out[4] = _mm_sub_epi16(step1[3], step2[4]);
+  out[5] = _mm_sub_epi16(step1[2], step1[5]);
+  out[6] = _mm_sub_epi16(step1[1], step1[6]);
+  out[7] = _mm_sub_epi16(step1[0], step2[7]);
 }
 
 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
@@ -233,25 +228,40 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
   write_buffer_8x8(in, dest, stride);
 }
 
+static INLINE void recon_and_store_8_dual(uint8_t *const dest,
+                                          const __m128i in_x,
+                                          const int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d0, d1;
+
+  d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride));
+  d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride));
+  d0 = _mm_unpacklo_epi8(d0, zero);
+  d1 = _mm_unpacklo_epi8(d1, zero);
+  d0 = _mm_add_epi16(in_x, d0);
+  d1 = _mm_add_epi16(in_x, d1);
+  d0 = _mm_packus_epi16(d0, d1);
+  _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0);
+  _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0));
+}
+
 void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
   __m128i dc_value;
-  int a;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 5);
-
-  dc_value = _mm_set1_epi16(a);
-
-  recon_and_store(dest + 0 * stride, dc_value);
-  recon_and_store(dest + 1 * stride, dc_value);
-  recon_and_store(dest + 2 * stride, dc_value);
-  recon_and_store(dest + 3 * stride, dc_value);
-  recon_and_store(dest + 4 * stride, dc_value);
-  recon_and_store(dest + 5 * stride, dc_value);
-  recon_and_store(dest + 6 * stride, dc_value);
-  recon_and_store(dest + 7 * stride, dc_value);
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  dc_value = _mm_set1_epi16(a1);
+
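+  // Every output pixel gets the same DC value, so each call below
+  // reconstructs two rows of dest at once.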
+  recon_and_store_8_dual(dest, dc_value, stride);
+  dest += 2 * stride;
+  recon_and_store_8_dual(dest, dc_value, stride);
+  dest += 2 * stride;
+  recon_and_store_8_dual(dest, dc_value, stride);
+  dest += 2 * stride;
+  recon_and_store_8_dual(dest, dc_value, stride);
 }
 
 void idct8_sse2(__m128i *in) {
@@ -466,70 +476,59 @@ void iadst8_sse2(__m128i *in) {
 void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  __m128i in[8], step1[8], step2[8], tmp[4];
 
-  __m128i in[8];
-  __m128i stp1_2, stp1_3, stp1_4, stp1_5;
-  __m128i stp2_0, stp2_2, stp2_4, stp2_5, stp2_6;
-  __m128i tmp[4];
-
-  // Rows. Load 4-row input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 1);
-  in[2] = load_input_data(input + 8 * 2);
-  in[3] = load_input_data(input + 8 * 3);
+  in[0] = load_input_data4(input + 0 * 8);
+  in[1] = load_input_data4(input + 1 * 8);
+  in[2] = load_input_data4(input + 2 * 8);
+  in[3] = load_input_data4(input + 3 * 8);
 
-  // 8x4 Transpose
   transpose_16bit_4x4(in, in);
-  // Stage1
-  {
-    const __m128i lo_17 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_35 = _mm_unpackhi_epi16(in[1], zero);
+  // in[0]: 00 10 20 30  01 11 21 31
+  // in[1]: 02 12 22 32  03 13 23 33
 
-    stp1_4 = idct_calc_wraplow_sse2(stg1_0, stg1_1, lo_17);
-    stp1_5 = idct_calc_wraplow_sse2(stg1_2, stg1_3, lo_35);
+  // stage 1
+  {
+    const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+    const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+    const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+    const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+    const __m128i lo_1 = _mm_unpackhi_epi16(in[0], zero);
+    const __m128i lo_3 = _mm_unpackhi_epi16(in[1], zero);
+    step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1);    // step1 4&7
+    step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3);  // step1 5&6
   }
 
-  // Stage2
+  // stage 2
   {
-    const __m128i lo_04 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_26 = _mm_unpacklo_epi16(in[1], zero);
-
-    stp2_0 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_04);
-    stp2_2 = idct_calc_wraplow_sse2(stg2_3, stg2_2, lo_26);
-
-    tmp[0] = _mm_add_epi16(stp1_4, stp1_5);
-    tmp[1] = _mm_sub_epi16(stp1_4, stp1_5);
-
-    stp2_4 = tmp[0];
-    stp2_5 = _mm_unpacklo_epi64(tmp[1], zero);
-    stp2_6 = _mm_unpackhi_epi64(tmp[1], zero);
+    const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+    const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+    const __m128i lo_0 = _mm_unpacklo_epi16(in[0], zero);
+    const __m128i lo_2 = _mm_unpacklo_epi16(in[1], zero);
+    step2[0] = idct_calc_wraplow_sse2(cp_16_16, cp_16_n16, lo_0);  // step2 0&1
+    step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2);    // step2 3&2
+    step2[4] = _mm_add_epi16(step1[4], step1[5]);                  // step2 4&7
+    step2[5] = _mm_sub_epi16(step1[4], step1[5]);                  // step2 5&6
+    step2[6] = _mm_unpackhi_epi64(step2[5], zero);                 // step2 6
   }
 
-  // Stage3
+  // stage 3
   {
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
-    tmp[0] = _mm_add_epi16(stp2_0, stp2_2);
-    tmp[1] = _mm_sub_epi16(stp2_0, stp2_2);
-    stp1_2 = _mm_unpackhi_epi64(tmp[1], tmp[0]);
-    stp1_3 = _mm_unpacklo_epi64(tmp[1], tmp[0]);
-    stp1_5 = idct_calc_wraplow_sse2(stg3_0, stg2_0, lo_56);  // stg3_1 = stg2_0
+    const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]);
+    tmp[0] = _mm_add_epi16(step2[0], step2[2]);                     // step1 0&1
+    tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                     // step1 3&2
+    step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                  // step1 2&1
+    step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                  // step1 3&0
+    step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65);  // step1 5&6
   }
 
-  // Stage4
-  tmp[0] = _mm_add_epi16(stp1_3, stp2_4);
-  tmp[1] = _mm_add_epi16(stp1_2, stp1_5);
-  tmp[2] = _mm_sub_epi16(stp1_3, stp2_4);
-  tmp[3] = _mm_sub_epi16(stp1_2, stp1_5);
+  // stage 4
+  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
+  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
+  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
+  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
 
   idct8x8_12_transpose_16bit_4x8(tmp, in);
   in[4] = in[5] = in[6] = in[7] = zero;
@@ -538,77 +537,6 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
   write_buffer_8x8(in, dest, stride);
 }
 
-#define IDCT16                                                               \
-  /* Stage2 */                                                               \
-  multiplication_and_add(&in[1], &in[15], &in[9], &in[7], &stg2_0, &stg2_1,  \
-                         &stg2_2, &stg2_3, &stp2_8, &stp2_15, &stp2_9,       \
-                         &stp2_14);                                          \
-                                                                             \
-  multiplication_and_add(&in[5], &in[11], &in[13], &in[3], &stg2_4, &stg2_5, \
-                         &stg2_6, &stg2_7, &stp2_10, &stp2_13, &stp2_11,     \
-                         &stp2_12);                                          \
-                                                                             \
-  /* Stage3 */                                                               \
-  multiplication_and_add(&in[2], &in[14], &in[10], &in[6], &stg3_0, &stg3_1, \
-                         &stg3_2, &stg3_3, &stp1_4, &stp1_7, &stp1_5,        \
-                         &stp1_6);                                           \
-                                                                             \
-  stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
-  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-  stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-                                                                             \
-  stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
-  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-  stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-                                                                             \
-  /* Stage4 */                                                               \
-  multiplication_and_add(&in[0], &in[8], &in[4], &in[12], &stg4_0, &stg4_1,  \
-                         &stg4_2, &stg4_3, &stp2_0, &stp2_1, &stp2_2,        \
-                         &stp2_3);                                           \
-                                                                             \
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                             \
-  multiplication_and_add(&stp1_9, &stp1_14, &stp1_10, &stp1_13, &stg4_4,     \
-                         &stg4_5, &stg4_6, &stg4_7, &stp2_9, &stp2_14,       \
-                         &stp2_10, &stp2_13);                                \
-                                                                             \
-  /* Stage5 */                                                               \
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-  multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5,      \
-                           &stp1_6);                                         \
-                                                                             \
-  stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-  stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-  stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                             \
-  stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-  stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-  stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-                                                                             \
-  /* Stage6 */                                                               \
-  stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-  stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-  stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-  stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-  stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-  stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                             \
-  multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0,    \
-                         &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13,      \
-                         &stp2_11, &stp2_12);
-
 #define IDCT16_10                                                              \
   /* Stage2 */                                                                 \
   multiplication_and_add(&in[1], &zero, &zero, &in[3], &stg2_0, &stg2_1,       \
@@ -662,142 +590,196 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                          &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13,        \
                          &stp2_11, &stp2_12);
 
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+static INLINE void idct16_8col(__m128i *const in) {
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  __m128i s[16], t[16];
 
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  // stage 2
+  {
+    const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+    const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+    const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+    const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+    multiplication_and_add(&in[1], &in[15], &in[9], &in[7], &k__cospi_p30_m02,
+                           &k__cospi_p02_p30, &k__cospi_p14_m18,
+                           &k__cospi_p18_p14, &s[8], &s[15], &s[9], &s[14]);
+  }
+  {
+    const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+    const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+    const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+    const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+    multiplication_and_add(&in[5], &in[11], &in[13], &in[3], &k__cospi_p22_m10,
+                           &k__cospi_p10_p22, &k__cospi_p06_m26,
+                           &k__cospi_p26_p06, &s[10], &s[13], &s[11], &s[12]);
+  }
 
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  // stage 3
+  {
+    const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+    const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+    const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+    const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+    multiplication_and_add(&in[2], &in[14], &in[10], &in[6], &k__cospi_p28_m04,
+                           &k__cospi_p04_p28, &k__cospi_p12_m20,
+                           &k__cospi_p20_p12, &t[4], &t[7], &t[5], &t[6]);
+  }
+  t[8] = _mm_add_epi16(s[8], s[9]);
+  t[9] = _mm_sub_epi16(s[8], s[9]);
+  t[10] = _mm_sub_epi16(s[11], s[10]);
+  t[11] = _mm_add_epi16(s[10], s[11]);
+  t[12] = _mm_add_epi16(s[12], s[13]);
+  t[13] = _mm_sub_epi16(s[12], s[13]);
+  t[14] = _mm_sub_epi16(s[15], s[14]);
+  t[15] = _mm_add_epi16(s[14], s[15]);
 
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  // stage 4
+  {
+    const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+    const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+    const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+    multiplication_and_add(&in[0], &in[8], &in[4], &in[12], &k__cospi_p16_p16,
+                           &k__cospi_p16_m16, &k__cospi_p24_m08,
+                           &k__cospi_p08_p24, &s[0], &s[1], &s[2], &s[3]);
+  }
+  s[5] = _mm_sub_epi16(t[4], t[5]);
+  t[4] = _mm_add_epi16(t[4], t[5]);
+  s[6] = _mm_sub_epi16(t[7], t[6]);
+  t[7] = _mm_add_epi16(t[6], t[7]);
+  s[8] = t[8];
+  {
+    const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+    const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+    const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+    multiplication_and_add(&t[9], &t[14], &t[10], &t[13], &k__cospi_m08_p24,
+                           &k__cospi_p24_p08, &k__cospi_m24_m08,
+                           &k__cospi_m08_p24, &s[9], &s[14], &s[10], &s[13]);
+  }
+  s[11] = t[11];
+  s[12] = t[12];
+  s[15] = t[15];
 
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  // stage 5
+  t[0] = _mm_add_epi16(s[0], s[3]);
+  t[1] = _mm_add_epi16(s[1], s[2]);
+  t[2] = _mm_sub_epi16(s[1], s[2]);
+  t[3] = _mm_sub_epi16(s[0], s[3]);
+  multiplication_and_add_2(&s[5], &s[6], &k__cospi_m16_p16, &k__cospi_p16_p16,
+                           &t[5], &t[6]);
+  t[8] = _mm_add_epi16(s[8], s[11]);
+  t[9] = _mm_add_epi16(s[9], s[10]);
+  t[10] = _mm_sub_epi16(s[9], s[10]);
+  t[11] = _mm_sub_epi16(s[8], s[11]);
+  t[12] = _mm_sub_epi16(s[15], s[12]);
+  t[13] = _mm_sub_epi16(s[14], s[13]);
+  t[14] = _mm_add_epi16(s[13], s[14]);
+  t[15] = _mm_add_epi16(s[12], s[15]);
 
-  __m128i in[16], l[16], r[16], *curr1;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+  // stage 6
+  s[0] = _mm_add_epi16(t[0], t[7]);
+  s[1] = _mm_add_epi16(t[1], t[6]);
+  s[2] = _mm_add_epi16(t[2], t[5]);
+  s[3] = _mm_add_epi16(t[3], t[4]);
+  s[4] = _mm_sub_epi16(t[3], t[4]);
+  s[5] = _mm_sub_epi16(t[2], t[5]);
+  s[6] = _mm_sub_epi16(t[1], t[6]);
+  s[7] = _mm_sub_epi16(t[0], t[7]);
+  multiplication_and_add(&t[10], &t[13], &t[11], &t[12], &k__cospi_m16_p16,
+                         &k__cospi_p16_p16, &k__cospi_m16_p16,
+                         &k__cospi_p16_p16, &s[10], &s[13], &s[11], &s[12]);
+
+  // stage 7
+  in[0] = _mm_add_epi16(s[0], t[15]);
+  in[1] = _mm_add_epi16(s[1], t[14]);
+  in[2] = _mm_add_epi16(s[2], s[13]);
+  in[3] = _mm_add_epi16(s[3], s[12]);
+  in[4] = _mm_add_epi16(s[4], s[11]);
+  in[5] = _mm_add_epi16(s[5], s[10]);
+  in[6] = _mm_add_epi16(s[6], t[9]);
+  in[7] = _mm_add_epi16(s[7], t[8]);
+  in[8] = _mm_sub_epi16(s[7], t[8]);
+  in[9] = _mm_sub_epi16(s[6], t[9]);
+  in[10] = _mm_sub_epi16(s[5], s[10]);
+  in[11] = _mm_sub_epi16(s[4], s[11]);
+  in[12] = _mm_sub_epi16(s[3], s[12]);
+  in[13] = _mm_sub_epi16(s[2], s[13]);
+  in[14] = _mm_sub_epi16(s[1], t[14]);
+  in[15] = _mm_sub_epi16(s[0], t[15]);
+}
+
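+// Load eight rows of eight coefficients; the 16x16 block has a row stride of
+// 16, hence the step of 8 * 2 between loads.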
+static INLINE void idct16_load8x8(const tran_low_t *const input,
+                                  __m128i *const in) {
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8 * 2);
+  in[2] = load_input_data8(input + 8 * 4);
+  in[3] = load_input_data8(input + 8 * 6);
+  in[4] = load_input_data8(input + 8 * 8);
+  in[5] = load_input_data8(input + 8 * 10);
+  in[6] = load_input_data8(input + 8 * 12);
+  in[7] = load_input_data8(input + 8 * 14);
+}
+
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+                                int stride) {
+  __m128i l[16], r[16], out[16], *in;
   int i;
 
-  curr1 = l;
+  in = l;
   for (i = 0; i < 2; i++) {
-    // 1-D idct
-
-    // Load input data.
-    in[0] = load_input_data(input);
-    in[8] = load_input_data(input + 8 * 1);
-    in[1] = load_input_data(input + 8 * 2);
-    in[9] = load_input_data(input + 8 * 3);
-    in[2] = load_input_data(input + 8 * 4);
-    in[10] = load_input_data(input + 8 * 5);
-    in[3] = load_input_data(input + 8 * 6);
-    in[11] = load_input_data(input + 8 * 7);
-    in[4] = load_input_data(input + 8 * 8);
-    in[12] = load_input_data(input + 8 * 9);
-    in[5] = load_input_data(input + 8 * 10);
-    in[13] = load_input_data(input + 8 * 11);
-    in[6] = load_input_data(input + 8 * 12);
-    in[14] = load_input_data(input + 8 * 13);
-    in[7] = load_input_data(input + 8 * 14);
-    in[15] = load_input_data(input + 8 * 15);
-
+    idct16_load8x8(input, in);
     transpose_16bit_8x8(in, in);
+    idct16_load8x8(input + 8, in + 8);
     transpose_16bit_8x8(in + 8, in + 8);
-
-    IDCT16
-
-    // Stage7
-    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
-    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
-    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
-    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
-    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
-    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
-    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
-    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
-    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    curr1 = r;
+    idct16_8col(in);
+    in = r;
     input += 128;
   }
+
   for (i = 0; i < 2; i++) {
     int j;
-    // 1-D idct
-    transpose_16bit_8x8(l + i * 8, in);
-    transpose_16bit_8x8(r + i * 8, in + 8);
-
-    IDCT16
-
-    // 2-D
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+    transpose_16bit_8x8(l + i * 8, out);
+    transpose_16bit_8x8(r + i * 8, out + 8);
+    idct16_8col(out);
 
+    // Final rounding and shift
     for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      recon_and_store(dest + j * stride, in[j]);
+      const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+      out[j] = _mm_adds_epi16(out[j], final_rounding);
+      out[j] = _mm_srai_epi16(out[j], 6);
+      recon_and_store(dest + j * stride, out[j]);
     }
 
     dest += 8;
   }
 }
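A rough scalar model of the two-pass structure above (hypothetical helper signature; idct16_1d stands in for idct16_8col, which works on eight columns per __m128i): pass 1 runs the 1-D transform over the rows of the 16x16 block, pass 2 runs it over the columns, then rounds by 6 bits and adds the result to the prediction.

#include <stdint.h>

/* Structural sketch only, assuming a scalar in-place 16-point IDCT. */
static void idct16x16_model(const int16_t in[16][16], uint8_t *dest, int stride,
                            void (*idct16_1d)(int16_t col[16])) {
  int16_t tmp[16][16], col[16];
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) tmp[r][c] = in[r][c];
    idct16_1d(tmp[r]); /* pass 1: rows */
  }
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col[r] = tmp[r][c];
    idct16_1d(col); /* pass 2: columns */
    for (r = 0; r < 16; ++r) {
      const int v = dest[r * stride + c] + ((col[r] + 32) >> 6);
      dest[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}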
 
+static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d0, d1;
+
+  d0 = _mm_load_si128((__m128i *)(dest));
+  d1 = _mm_unpackhi_epi8(d0, zero);
+  d0 = _mm_unpacklo_epi8(d0, zero);
+  d0 = _mm_add_epi16(in_x, d0);
+  d1 = _mm_add_epi16(in_x, d1);
+  d0 = _mm_packus_epi16(d0, d1);
+  _mm_store_si128((__m128i *)(dest), d0);
+}
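Roughly, recon_and_store_16 adds one row of eight 16-bit values to sixteen destination pixels (the same eight values for each half, which is all the DC-only path below needs) with unsigned saturation. A scalar sketch:

#include <stdint.h>

/* Illustration only; mirrors the unpack/add/packus sequence above. */
static void recon_and_store_16_scalar(uint8_t *dest, const int16_t res[8]) {
  int k;
  for (k = 0; k < 16; ++k) {
    const int v = dest[k] + res[k & 7]; /* same residual for each 8-pixel half */
    dest[k] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}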
+
 void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
   __m128i dc_value;
-  int a, i;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
+  int i;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
 
-  dc_value = _mm_set1_epi16(a);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  dc_value = _mm_set1_epi16(a1);
 
   for (i = 0; i < 16; ++i) {
-    recon_and_store(dest + 0, dc_value);
-    recon_and_store(dest + 8, dc_value);
+    recon_and_store_16(dest, dc_value);
     dest += stride;
   }
 }
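The DC value splatted above can be followed in scalar form; this is a minimal sketch using the libvpx constants (cospi_16_64 == 11585, DCT_CONST_BITS == 14), with the WRAPLOW saturation of the high-bit-depth build omitted for brevity.

#include <stdint.h>

static int16_t idct16_dc_value_sketch(int32_t dc) {
  const int64_t cospi_16_64 = 11585;                   /* round(2^14 * cos(pi/4)) */
  int64_t out = (dc * cospi_16_64 + (1 << 13)) >> 14;  /* dct_const_round_shift */
  out = (out * cospi_16_64 + (1 << 13)) >> 14;
  return (int16_t)((out + 32) >> 6);                   /* ROUND_POWER_OF_TWO(out, 6) */
}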
@@ -1222,179 +1204,6 @@ static void iadst16_8col(__m128i *in) {
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-static void idct16_8col(__m128i *in) {
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i u[16], s[16], t[16];
-
-  // stage 1
-  s[0] = in[0];
-  s[1] = in[8];
-  s[2] = in[4];
-  s[3] = in[12];
-  s[4] = in[2];
-  s[5] = in[10];
-  s[6] = in[6];
-  s[7] = in[14];
-  s[8] = in[1];
-  s[9] = in[9];
-  s[10] = in[5];
-  s[11] = in[13];
-  s[12] = in[3];
-  s[13] = in[11];
-  s[14] = in[7];
-  s[15] = in[15];
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
-  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
-  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
-  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
-  s[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p30_m02);
-  s[15] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p02_p30);
-  s[9] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p14_m18);
-  s[14] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p18_p14);
-  s[10] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p22_m10);
-  s[13] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p10_p22);
-  s[11] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p06_m26);
-  s[12] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p26_p06);
-
-  // stage 3
-  t[0] = s[0];
-  t[1] = s[1];
-  t[2] = s[2];
-  t[3] = s[3];
-  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
-  t[4] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p28_m04);
-  t[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p04_p28);
-  t[5] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p12_m20);
-  t[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p20_p12);
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
-  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
-  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
-  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
-  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
-  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
-  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
-  s[0] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
-  s[1] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
-  s[2] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p24_m08);
-  s[3] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p08_p24);
-  s[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m08_p24);
-  s[14] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p24_p08);
-  s[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m24_m08);
-  s[13] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m08_p24);
-  s[4] = _mm_add_epi16(t[4], t[5]);
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  s[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  s[15] = t[15];
-  s[11] = t[11];
-  s[12] = t[12];
-
-  // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  t[4] = s[4];
-  t[7] = s[7];
-
-  multiplication_and_add_2(&s[5], &s[6], &k__cospi_m16_p16, &k__cospi_p16_p16,
-                           &t[5], &t[6]);
-
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
-
-  // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
-
-  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
-  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
-  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
-  s[10] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16);
-  s[13] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
-  s[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
-  s[12] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
-  s[14] = t[14];
-  s[15] = t[15];
-
-  // stage 7
-  in[0] = _mm_add_epi16(s[0], s[15]);
-  in[1] = _mm_add_epi16(s[1], s[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], s[9]);
-  in[7] = _mm_add_epi16(s[7], s[8]);
-  in[8] = _mm_sub_epi16(s[7], s[8]);
-  in[9] = _mm_sub_epi16(s[6], s[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], s[14]);
-  in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
 void idct16_sse2(__m128i *in0, __m128i *in1) {
   transpose_16bit_16x16(in0, in1);
   idct16_8col(in0);
@@ -1438,10 +1247,10 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
   int i;
   // First 1-D inverse DCT
   // Load input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 2);
-  in[2] = load_input_data(input + 8 * 4);
-  in[3] = load_input_data(input + 8 * 6);
+  in[0] = load_input_data4(input + 0 * 16);
+  in[1] = load_input_data4(input + 1 * 16);
+  in[2] = load_input_data4(input + 2 * 16);
+  in[3] = load_input_data4(input + 3 * 16);
 
   transpose_16bit_4x4(in, in);
 
@@ -1583,12 +1392,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
   }
 }
 
-#define LOAD_DQCOEFF(reg, input)  \
-  {                               \
-    reg = load_input_data(input); \
-    input += 8;                   \
-  }
-
 #define IDCT32_34                                                              \
   /* Stage1 */                                                                 \
   multiplication_and_add_2(&in[1], &zero, &stg1_0, &stg1_1, &stp1_16,          \
@@ -1787,225 +1590,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
   stp1_30 = stp2_30;                                                           \
   stp1_31 = stp2_31;
 
-#define IDCT32                                                                \
-  /* Stage1 */                                                                \
-  multiplication_and_add(&in[1], &in[31], &in[17], &in[15], &stg1_0, &stg1_1, \
-                         &stg1_2, &stg1_3, &stp1_16, &stp1_31, &stp1_17,      \
-                         &stp1_30);                                           \
-  multiplication_and_add(&in[9], &in[23], &in[25], &in[7], &stg1_4, &stg1_5,  \
-                         &stg1_6, &stg1_7, &stp1_18, &stp1_29, &stp1_19,      \
-                         &stp1_28);                                           \
-  multiplication_and_add(&in[5], &in[27], &in[21], &in[11], &stg1_8, &stg1_9, \
-                         &stg1_10, &stg1_11, &stp1_20, &stp1_27, &stp1_21,    \
-                         &stp1_26);                                           \
-  multiplication_and_add(&in[13], &in[19], &in[29], &in[3], &stg1_12,         \
-                         &stg1_13, &stg1_14, &stg1_15, &stp1_22, &stp1_25,    \
-                         &stp1_23, &stp1_24);                                 \
-                                                                              \
-  /* Stage2 */                                                                \
-  multiplication_and_add(&in[2], &in[30], &in[18], &in[14], &stg2_0, &stg2_1, \
-                         &stg2_2, &stg2_3, &stp2_8, &stp2_15, &stp2_9,        \
-                         &stp2_14);                                           \
-  multiplication_and_add(&in[10], &in[22], &in[26], &in[6], &stg2_4, &stg2_5, \
-                         &stg2_6, &stg2_7, &stp2_10, &stp2_13, &stp2_11,      \
-                         &stp2_12);                                           \
-                                                                              \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                                  \
-  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                                  \
-  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                                  \
-  stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                                  \
-                                                                              \
-  stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                                  \
-  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                                  \
-  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                                  \
-  stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                                  \
-                                                                              \
-  stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                                  \
-  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                                  \
-  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                                  \
-  stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                                  \
-                                                                              \
-  stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                                  \
-  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                                  \
-  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                                  \
-  stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                                  \
-                                                                              \
-  /* Stage3 */                                                                \
-  multiplication_and_add(&in[4], &in[28], &in[20], &in[12], &stg3_0, &stg3_1, \
-                         &stg3_2, &stg3_3, &stp1_4, &stp1_7, &stp1_5,         \
-                         &stp1_6);                                            \
-                                                                              \
-  stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                     \
-  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                     \
-  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                  \
-  stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                  \
-  stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                                  \
-  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                  \
-  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                  \
-  stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                  \
-                                                                              \
-  multiplication_and_add(&stp2_17, &stp2_30, &stp2_18, &stp2_29, &stg3_4,     \
-                         &stg3_5, &stg3_6, &stg3_4, &stp1_17, &stp1_30,       \
-                         &stp1_18, &stp1_29);                                 \
-  multiplication_and_add(&stp2_21, &stp2_26, &stp2_22, &stp2_25, &stg3_8,     \
-                         &stg3_9, &stg3_10, &stg3_8, &stp1_21, &stp1_26,      \
-                         &stp1_22, &stp1_25);                                 \
-                                                                              \
-  stp1_16 = stp2_16;                                                          \
-  stp1_31 = stp2_31;                                                          \
-  stp1_19 = stp2_19;                                                          \
-  stp1_20 = stp2_20;                                                          \
-  stp1_23 = stp2_23;                                                          \
-  stp1_24 = stp2_24;                                                          \
-  stp1_27 = stp2_27;                                                          \
-  stp1_28 = stp2_28;                                                          \
-                                                                              \
-  /* Stage4 */                                                                \
-  multiplication_and_add(&in[0], &in[16], &in[8], &in[24], &stg4_0, &stg4_1,  \
-                         &stg4_2, &stg4_3, &stp2_0, &stp2_1, &stp2_2,         \
-                         &stp2_3);                                            \
-                                                                              \
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                     \
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                     \
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                     \
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                     \
-                                                                              \
-  multiplication_and_add(&stp1_9, &stp1_14, &stp1_10, &stp1_13, &stg4_4,      \
-                         &stg4_5, &stg4_6, &stg4_4, &stp2_9, &stp2_14,        \
-                         &stp2_10, &stp2_13);                                 \
-                                                                              \
-  stp2_8 = stp1_8;                                                            \
-  stp2_15 = stp1_15;                                                          \
-  stp2_11 = stp1_11;                                                          \
-  stp2_12 = stp1_12;                                                          \
-                                                                              \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                  \
-  stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                  \
-  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                  \
-  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                  \
-  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                  \
-  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                  \
-  stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                  \
-  stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                  \
-                                                                              \
-  stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                  \
-  stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                  \
-  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                  \
-  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                  \
-  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                  \
-  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                  \
-  stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                  \
-  stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                  \
-                                                                              \
-  /* Stage5 */                                                                \
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                     \
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                     \
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                     \
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                     \
-  multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5,       \
-                           &stp1_6);                                          \
-                                                                              \
-  stp1_4 = stp2_4;                                                            \
-  stp1_7 = stp2_7;                                                            \
-                                                                              \
-  stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                    \
-  stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                    \
-  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                   \
-  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                   \
-  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                  \
-  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                  \
-  stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                  \
-  stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                  \
-                                                                              \
-  stp1_16 = stp2_16;                                                          \
-  stp1_17 = stp2_17;                                                          \
-                                                                              \
-  multiplication_and_add(&stp2_18, &stp2_29, &stp2_19, &stp2_28, &stg4_4,     \
-                         &stg4_5, &stg4_4, &stg4_5, &stp1_18, &stp1_29,       \
-                         &stp1_19, &stp1_28);                                 \
-  multiplication_and_add(&stp2_20, &stp2_27, &stp2_21, &stp2_26, &stg4_6,     \
-                         &stg4_4, &stg4_6, &stg4_4, &stp1_20, &stp1_27,       \
-                         &stp1_21, &stp1_26);                                 \
-                                                                              \
-  stp1_22 = stp2_22;                                                          \
-  stp1_23 = stp2_23;                                                          \
-  stp1_24 = stp2_24;                                                          \
-  stp1_25 = stp2_25;                                                          \
-  stp1_30 = stp2_30;                                                          \
-  stp1_31 = stp2_31;                                                          \
-                                                                              \
-  /* Stage6 */                                                                \
-  stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                     \
-  stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                     \
-  stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                     \
-  stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                     \
-  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                     \
-  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                     \
-  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                     \
-  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                     \
-                                                                              \
-  stp2_8 = stp1_8;                                                            \
-  stp2_9 = stp1_9;                                                            \
-  stp2_14 = stp1_14;                                                          \
-  stp2_15 = stp1_15;                                                          \
-                                                                              \
-  multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0,     \
-                         &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13,       \
-                         &stp2_11, &stp2_12);                                 \
-                                                                              \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                  \
-  stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                  \
-  stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                  \
-  stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                  \
-  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                  \
-  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                  \
-  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                  \
-  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                  \
-                                                                              \
-  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                  \
-  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                  \
-  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                  \
-  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                  \
-  stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                  \
-  stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                  \
-  stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                  \
-  stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                  \
-                                                                              \
-  /* Stage7 */                                                                \
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                    \
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                    \
-  stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                    \
-  stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                    \
-  stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                    \
-  stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                    \
-  stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                     \
-  stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                     \
-  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                     \
-  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                     \
-  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                   \
-  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                   \
-  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                   \
-  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                   \
-  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                   \
-  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                   \
-                                                                              \
-  stp1_16 = stp2_16;                                                          \
-  stp1_17 = stp2_17;                                                          \
-  stp1_18 = stp2_18;                                                          \
-  stp1_19 = stp2_19;                                                          \
-                                                                              \
-  multiplication_and_add(&stp2_20, &stp2_27, &stp2_21, &stp2_26, &stg6_0,     \
-                         &stg4_0, &stg6_0, &stg4_0, &stp1_20, &stp1_27,       \
-                         &stp1_21, &stp1_26);                                 \
-  multiplication_and_add(&stp2_22, &stp2_25, &stp2_23, &stp2_24, &stg6_0,     \
-                         &stg4_0, &stg6_0, &stg4_0, &stp1_22, &stp1_25,       \
-                         &stp1_23, &stp1_24);                                 \
-                                                                              \
-  stp1_28 = stp2_28;                                                          \
-  stp1_29 = stp2_29;                                                          \
-  stp1_30 = stp2_30;                                                          \
-  stp1_31 = stp2_31;
-
 // Only upper-left 8x8 has non-zero coeff
 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
@@ -2056,14 +1640,14 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
   int i;
 
   // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
+  in[0] = load_input_data8(input + 0 * 32);
+  in[1] = load_input_data8(input + 1 * 32);
+  in[2] = load_input_data8(input + 2 * 32);
+  in[3] = load_input_data8(input + 3 * 32);
+  in[4] = load_input_data8(input + 4 * 32);
+  in[5] = load_input_data8(input + 5 * 32);
+  in[6] = load_input_data8(input + 6 * 32);
+  in[7] = load_input_data8(input + 7 * 32);
 
   transpose_16bit_8x8(in, in);
   IDCT32_34
@@ -2152,178 +1736,284 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
   }
 }
 
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
+// For each 8x32 block __m128i in[32]:
+// Inputs with index 0, 4, 8, 12, 16, 20, 24, 28.
+// Output pixels 0-7, written to __m128i out[8].
+static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
+                                       __m128i *out /*out[8]*/) {
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_
 
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  {
+    const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+    const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+    const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+    const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
+    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
+  }
 
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  v4 = _mm_add_epi16(u4, u5);
+  v5 = _mm_sub_epi16(u4, u5);
+  v6 = _mm_sub_epi16(u7, u6);
+  v7 = _mm_add_epi16(u7, u6);
 
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+  {
+    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+    const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+    const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
+
+    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
+    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
+  }
 
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  v0 = _mm_add_epi16(u0, u3);
+  v1 = _mm_add_epi16(u1, u2);
+  v2 = _mm_sub_epi16(u1, u2);
+  v3 = _mm_sub_epi16(u0, u3);
+
+  out[0] = _mm_add_epi16(v0, v7);
+  out[1] = _mm_add_epi16(v1, v6);
+  out[2] = _mm_add_epi16(v2, v5);
+  out[3] = _mm_add_epi16(v3, v4);
+  out[4] = _mm_sub_epi16(v3, v4);
+  out[5] = _mm_sub_epi16(v2, v5);
+  out[6] = _mm_sub_epi16(v1, v6);
+  out[7] = _mm_sub_epi16(v0, v7);
+}
 
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+// For each 8x32 block __m128i in[32]:
+// Inputs with index 2, 6, 10, 14, 18, 22, 26, 30.
+// Output pixels 8-15 of the 32-point transform, written to out[0..7]
+// (the caller passes &temp[8]).
+static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
+                                       __m128i *out /*out[16]*/) {
+  __m128i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
+  __m128i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_
 
-  __m128i in[32], col[128], zero_idx[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  int i, j, i32;
+  {
+    const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+    const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+    const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+    const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
+    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
+  }
 
-  for (i = 0; i < 4; i++) {
-    i32 = (i << 5);
-    // First 1-D idct
-    // Load input data.
-    LOAD_DQCOEFF(in[0], input);
-    LOAD_DQCOEFF(in[8], input);
-    LOAD_DQCOEFF(in[16], input);
-    LOAD_DQCOEFF(in[24], input);
-    LOAD_DQCOEFF(in[1], input);
-    LOAD_DQCOEFF(in[9], input);
-    LOAD_DQCOEFF(in[17], input);
-    LOAD_DQCOEFF(in[25], input);
-    LOAD_DQCOEFF(in[2], input);
-    LOAD_DQCOEFF(in[10], input);
-    LOAD_DQCOEFF(in[18], input);
-    LOAD_DQCOEFF(in[26], input);
-    LOAD_DQCOEFF(in[3], input);
-    LOAD_DQCOEFF(in[11], input);
-    LOAD_DQCOEFF(in[19], input);
-    LOAD_DQCOEFF(in[27], input);
-
-    LOAD_DQCOEFF(in[4], input);
-    LOAD_DQCOEFF(in[12], input);
-    LOAD_DQCOEFF(in[20], input);
-    LOAD_DQCOEFF(in[28], input);
-    LOAD_DQCOEFF(in[5], input);
-    LOAD_DQCOEFF(in[13], input);
-    LOAD_DQCOEFF(in[21], input);
-    LOAD_DQCOEFF(in[29], input);
-    LOAD_DQCOEFF(in[6], input);
-    LOAD_DQCOEFF(in[14], input);
-    LOAD_DQCOEFF(in[22], input);
-    LOAD_DQCOEFF(in[30], input);
-    LOAD_DQCOEFF(in[7], input);
-    LOAD_DQCOEFF(in[15], input);
-    LOAD_DQCOEFF(in[23], input);
-    LOAD_DQCOEFF(in[31], input);
-
-    // checking if all entries are zero
-    zero_idx[0] = _mm_or_si128(in[0], in[1]);
-    zero_idx[1] = _mm_or_si128(in[2], in[3]);
-    zero_idx[2] = _mm_or_si128(in[4], in[5]);
-    zero_idx[3] = _mm_or_si128(in[6], in[7]);
-    zero_idx[4] = _mm_or_si128(in[8], in[9]);
-    zero_idx[5] = _mm_or_si128(in[10], in[11]);
-    zero_idx[6] = _mm_or_si128(in[12], in[13]);
-    zero_idx[7] = _mm_or_si128(in[14], in[15]);
-    zero_idx[8] = _mm_or_si128(in[16], in[17]);
-    zero_idx[9] = _mm_or_si128(in[18], in[19]);
-    zero_idx[10] = _mm_or_si128(in[20], in[21]);
-    zero_idx[11] = _mm_or_si128(in[22], in[23]);
-    zero_idx[12] = _mm_or_si128(in[24], in[25]);
-    zero_idx[13] = _mm_or_si128(in[26], in[27]);
-    zero_idx[14] = _mm_or_si128(in[28], in[29]);
-    zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
-    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
-    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
-    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
-      col[i32 + 0] = _mm_setzero_si128();
-      col[i32 + 1] = _mm_setzero_si128();
-      col[i32 + 2] = _mm_setzero_si128();
-      col[i32 + 3] = _mm_setzero_si128();
-      col[i32 + 4] = _mm_setzero_si128();
-      col[i32 + 5] = _mm_setzero_si128();
-      col[i32 + 6] = _mm_setzero_si128();
-      col[i32 + 7] = _mm_setzero_si128();
-      col[i32 + 8] = _mm_setzero_si128();
-      col[i32 + 9] = _mm_setzero_si128();
-      col[i32 + 10] = _mm_setzero_si128();
-      col[i32 + 11] = _mm_setzero_si128();
-      col[i32 + 12] = _mm_setzero_si128();
-      col[i32 + 13] = _mm_setzero_si128();
-      col[i32 + 14] = _mm_setzero_si128();
-      col[i32 + 15] = _mm_setzero_si128();
-      col[i32 + 16] = _mm_setzero_si128();
-      col[i32 + 17] = _mm_setzero_si128();
-      col[i32 + 18] = _mm_setzero_si128();
-      col[i32 + 19] = _mm_setzero_si128();
-      col[i32 + 20] = _mm_setzero_si128();
-      col[i32 + 21] = _mm_setzero_si128();
-      col[i32 + 22] = _mm_setzero_si128();
-      col[i32 + 23] = _mm_setzero_si128();
-      col[i32 + 24] = _mm_setzero_si128();
-      col[i32 + 25] = _mm_setzero_si128();
-      col[i32 + 26] = _mm_setzero_si128();
-      col[i32 + 27] = _mm_setzero_si128();
-      col[i32 + 28] = _mm_setzero_si128();
-      col[i32 + 29] = _mm_setzero_si128();
-      col[i32 + 30] = _mm_setzero_si128();
-      col[i32 + 31] = _mm_setzero_si128();
-      continue;
-    }
+  v8 = _mm_add_epi16(u8, u9);
+  v9 = _mm_sub_epi16(u8, u9);
+  v14 = _mm_sub_epi16(u15, u14);
+  v15 = _mm_add_epi16(u15, u14);
+
+  {
+    const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+    const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+    const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+    const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
+    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
+  }
+
+  v10 = _mm_sub_epi16(u11, u10);
+  v11 = _mm_add_epi16(u11, u10);
+  v12 = _mm_add_epi16(u12, u13);
+  v13 = _mm_sub_epi16(u12, u13);
+
+  {
+    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
+    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
+  }
+
+  out[0] = _mm_add_epi16(v8, v11);
+  out[1] = _mm_add_epi16(v9, v10);
+  out[6] = _mm_add_epi16(v14, v13);
+  out[7] = _mm_add_epi16(v15, v12);
+
+  out[2] = _mm_sub_epi16(v9, v10);
+  out[3] = _mm_sub_epi16(v8, v11);
+  out[4] = _mm_sub_epi16(v15, v12);
+  out[5] = _mm_sub_epi16(v14, v13);
+
+  {
+    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
+    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
+  }
+}
+
+// For each 8x32 block __m128i in[32]:
+// Inputs with odd index
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31.
+// Output pixels 16-31 of the 32-point transform. To avoid hiding the offset
+// of 16 inside this function, the results are written to out[0..15] and the
+// caller passes &temp[16].
+static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
+                                         __m128i *out /*out[16]*/) {
+  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
+  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
+  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
+  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
+
+  {
+    const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+    const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+    const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+    const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+    const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+    const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+    const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+    const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+    const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+    const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+    const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+    const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+    const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+    const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+    const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+    const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
+    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
+    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
+    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
+
+    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
+    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
+
+    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
+    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
+  }
+
+  v16 = _mm_add_epi16(u16, u17);
+  v17 = _mm_sub_epi16(u16, u17);
+  v18 = _mm_sub_epi16(u19, u18);
+  v19 = _mm_add_epi16(u19, u18);
+
+  v20 = _mm_add_epi16(u20, u21);
+  v21 = _mm_sub_epi16(u20, u21);
+  v22 = _mm_sub_epi16(u23, u22);
+  v23 = _mm_add_epi16(u23, u22);
+
+  v24 = _mm_add_epi16(u24, u25);
+  v25 = _mm_sub_epi16(u24, u25);
+  v26 = _mm_sub_epi16(u27, u26);
+  v27 = _mm_add_epi16(u27, u26);
+
+  v28 = _mm_add_epi16(u28, u29);
+  v29 = _mm_sub_epi16(u28, u29);
+  v30 = _mm_sub_epi16(u31, u30);
+  v31 = _mm_add_epi16(u31, u30);
+
+  {
+    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
+    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
+    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
+    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
+  }
+
+  u16 = _mm_add_epi16(v16, v19);
+  u17 = _mm_add_epi16(v17, v18);
+  u18 = _mm_sub_epi16(v17, v18);
+  u19 = _mm_sub_epi16(v16, v19);
+  u20 = _mm_sub_epi16(v23, v20);
+  u21 = _mm_sub_epi16(v22, v21);
+  u22 = _mm_add_epi16(v22, v21);
+  u23 = _mm_add_epi16(v23, v20);
+
+  u24 = _mm_add_epi16(v24, v27);
+  u25 = _mm_add_epi16(v25, v26);
+  u26 = _mm_sub_epi16(v25, v26);
+  u27 = _mm_sub_epi16(v24, v27);
+
+  u28 = _mm_sub_epi16(v31, v28);
+  u29 = _mm_sub_epi16(v30, v29);
+  u30 = _mm_add_epi16(v29, v30);
+  u31 = _mm_add_epi16(v28, v31);
+
+  {
+    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
+  }
+
+  out[0] = _mm_add_epi16(u16, u23);
+  out[1] = _mm_add_epi16(u17, u22);
+  out[2] = _mm_add_epi16(u18, u21);
+  out[3] = _mm_add_epi16(u19, u20);
+  out[4] = _mm_sub_epi16(u19, u20);
+  out[5] = _mm_sub_epi16(u18, u21);
+  out[6] = _mm_sub_epi16(u17, u22);
+  out[7] = _mm_sub_epi16(u16, u23);
+
+  out[8] = _mm_sub_epi16(u31, u24);
+  out[9] = _mm_sub_epi16(u30, u25);
+  out[10] = _mm_sub_epi16(u29, u26);
+  out[11] = _mm_sub_epi16(u28, u27);
+  out[12] = _mm_add_epi16(u27, u28);
+  out[13] = _mm_add_epi16(u26, u29);
+  out[14] = _mm_add_epi16(u25, u30);
+  out[15] = _mm_add_epi16(u24, u31);
+
+  {
+    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
+    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
+    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
+    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
+  }
+}
+
+static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
+                                         __m128i *out /*out[32]*/) {
+  __m128i temp[16];
+  idct32_full_8x32_quarter_1(in, temp);
+  idct32_full_8x32_quarter_2(in, &temp[8]);
+  add_sub_butterfly(temp, out, 16);
+}
+
+static void idct32_full_8x32(const __m128i *in /*in[32]*/,
+                             __m128i *out /*out[32]*/) {
+  __m128i temp[32];
+  idct32_full_8x32_quarter_1_2(in, temp);
+  idct32_full_8x32_quarter_3_4(in, &temp[16]);
+  add_sub_butterfly(temp, out, 32);
+}
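The call structure above can be summarized with a declaration-only scalar sketch; every helper name below is hypothetical, introduced just to show how the quarters and the mirrored add/sub merges nest.

#include <stdint.h>

/* Hypothetical scalar helpers, declared only to illustrate the call graph. */
void idct8_even_quarter(const int16_t in[32], int16_t out[8]);  /* quarter 1 */
void idct8_odd_quarter(const int16_t in[32], int16_t out[8]);   /* quarter 2 */
void idct16_odd_half(const int16_t in[32], int16_t out[16]);    /* quarters 3/4 */
void add_sub_merge(const int16_t *in, int16_t *out, int size);

void idct32_1d_structure(const int16_t in[32], int16_t out[32]) {
  int16_t quarters[16], halves[32];
  idct8_even_quarter(in, quarters);    /* pixels 0-7 from in[0, 4, ..., 28]   */
  idct8_odd_quarter(in, quarters + 8); /* pixels 8-15 from in[2, 6, ..., 30]  */
  add_sub_merge(quarters, halves, 16); /* even half of the 32-point transform */
  idct16_odd_half(in, halves + 16);    /* odd half from the odd-indexed in[]  */
  add_sub_merge(halves, out, 32);      /* final mirrored add/sub butterfly    */
}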
+
+static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
+  int i;
+  for (i = 0; i < 8; ++i) {
+    in[i] = load_input_data8(input);
+    in[i + 8] = load_input_data8(input + 8);
+    in[i + 16] = load_input_data8(input + 16);
+    in[i + 24] = load_input_data8(input + 24);
+    input += 32;
+  }
+}
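In scalar terms (illustration only; the narrowing done by load_input_data8 is ignored), load_buffer_8x32 splits each of the eight 32-wide coefficient rows into four groups of eight:

#include <stdint.h>

static void load_buffer_8x32_scalar(const int16_t *input, int16_t out[32][8]) {
  int r, g, c;
  for (r = 0; r < 8; ++r) {
    for (g = 0; g < 4; ++g) {
      for (c = 0; c < 8; ++c) out[r + 8 * g][c] = input[32 * r + 8 * g + c];
    }
  }
}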
+
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
+                                 int stride) {
+  __m128i col[128], in[32];
+  int i, j;
+
+  // rows
+  for (i = 0; i < 4; ++i) {
+    load_buffer_8x32(input, in);
+    input += 32 << 3;
 
     // Transpose 32x8 block to 8x32 block
     transpose_16bit_8x8(in, in);
@@ -2331,95 +2021,20 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
     transpose_16bit_8x8(in + 16, in + 16);
     transpose_16bit_8x8(in + 24, in + 24);
 
-    IDCT32
-
-    // 1_D: Store 32 intermediate results for each 8x32 block.
-    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+    idct32_full_8x32(in, col + (i << 5));
   }
-  for (i = 0; i < 4; i++) {
-    // Second 1-D idct
-    j = i << 3;
 
+  // columns
+  for (i = 0; i < 4; ++i) {
+    j = i << 3;
     // Transpose 32x8 block to 8x32 block
     transpose_16bit_8x8(col + j, in);
     transpose_16bit_8x8(col + j + 32, in + 8);
     transpose_16bit_8x8(col + j + 64, in + 16);
     transpose_16bit_8x8(col + j + 96, in + 24);
 
-    IDCT32
-
-    // 2_D: Calculate the results and store them to destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      recon_and_store(dest + j * stride, in[j]);
-    }
-
+    idct32_full_8x32(in, in);
+    store_buffer_8x32(in, dest, stride);
     dest += 8;
   }
 }
@@ -2427,18 +2042,16 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
 void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
   __m128i dc_value;
-  int a, j;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
+  int j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
 
-  dc_value = _mm_set1_epi16(a);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  dc_value = _mm_set1_epi16(a1);
 
   for (j = 0; j < 32; ++j) {
-    recon_and_store(dest + 0 + j * stride, dc_value);
-    recon_and_store(dest + 8 + j * stride, dc_value);
-    recon_and_store(dest + 16 + j * stride, dc_value);
-    recon_and_store(dest + 24 + j * stride, dc_value);
+    recon_and_store_16(dest + j * stride + 0, dc_value);
+    recon_and_store_16(dest + j * stride + 16, dc_value);
   }
 }
index acaf86178d4259385e9d09a745aca86f1b723004..e6e1cd403bfa922bca9675747fda468662d13ef9 100644 (file)
@@ -76,24 +76,35 @@ static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
   return _mm_packs_epi32(t0, t1);
 }
 
-// Function to allow 8 bit optimisations to be used when profile 0 is used with
+static INLINE void multiplication_and_add_2(const __m128i *const in0,
+                                            const __m128i *const in1,
+                                            const __m128i *const cst0,
+                                            const __m128i *const cst1,
+                                            __m128i *const res0,
+                                            __m128i *const res1) {
+  const __m128i lo = _mm_unpacklo_epi16(*in0, *in1);
+  const __m128i hi = _mm_unpackhi_epi16(*in0, *in1);
+  *res0 = idct_calc_wraplow_sse2(lo, hi, *cst0);
+  *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1);
+}
+
+// Functions to allow 8 bit optimisations to be used when profile 0 is used with
 // highbitdepth enabled
-static INLINE __m128i load_input_data(const tran_low_t *data) {
+static INLINE __m128i load_input_data4(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i in = _mm_load_si128((const __m128i *)data);
+  return _mm_packs_epi32(in, zero);
+#else
+  return _mm_loadl_epi64((const __m128i *)data);
+#endif
+}
+
+static INLINE __m128i load_input_data8(const tran_low_t *data) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  // in0: 0 X 1 X  2 X 3 X
-  // in1: 4 X 5 X  6 X 7 X
-  // t0:  0 4 X X  1 5 X X
-  // t1:  2 6 X X  3 7 X X
-  // t2:  0 2 4 6  X X X X
-  // t3:  1 3 5 7  X X X X
-  // rtn: 0 1 2 3  4 5 6 7
   const __m128i in0 = _mm_load_si128((const __m128i *)data);
   const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4));
-  const __m128i t0 = _mm_unpacklo_epi16(in0, in1);
-  const __m128i t1 = _mm_unpackhi_epi16(in0, in1);
-  const __m128i t2 = _mm_unpacklo_epi16(t0, t1);
-  const __m128i t3 = _mm_unpackhi_epi16(t0, t1);
-  return _mm_unpacklo_epi16(t2, t3);
+  return _mm_packs_epi32(in0, in1);
 #else
   return _mm_load_si128((const __m128i *)data);
 #endif
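When CONFIG_VP9_HIGHBITDEPTH is set, tran_low_t is 32 bits wide, so the eight coefficients are narrowed to int16_t with signed saturation (which is what _mm_packs_epi32 does); otherwise they are already 16-bit and a plain 128-bit load suffices. load_input_data4 is the four-coefficient analogue, with the upper lanes left zero. A scalar sketch of the high-bit-depth path:

#include <stdint.h>

static void load_input_data8_scalar(const int32_t *data, int16_t out[8]) {
  int k;
  for (k = 0; k < 8; ++k) {
    const int32_t v = data[k];
    out[k] = (int16_t)(v > INT16_MAX ? INT16_MAX
                                     : (v < INT16_MIN ? INT16_MIN : v));
  }
}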
@@ -101,35 +112,35 @@ static INLINE __m128i load_input_data(const tran_low_t *data) {
 
 static INLINE void load_buffer_8x8(const tran_low_t *const input,
                                    __m128i *const in) {
-  in[0] = load_input_data(input + 0 * 8);
-  in[1] = load_input_data(input + 1 * 8);
-  in[2] = load_input_data(input + 2 * 8);
-  in[3] = load_input_data(input + 3 * 8);
-  in[4] = load_input_data(input + 4 * 8);
-  in[5] = load_input_data(input + 5 * 8);
-  in[6] = load_input_data(input + 6 * 8);
-  in[7] = load_input_data(input + 7 * 8);
+  in[0] = load_input_data8(input + 0 * 8);
+  in[1] = load_input_data8(input + 1 * 8);
+  in[2] = load_input_data8(input + 2 * 8);
+  in[3] = load_input_data8(input + 3 * 8);
+  in[4] = load_input_data8(input + 4 * 8);
+  in[5] = load_input_data8(input + 5 * 8);
+  in[6] = load_input_data8(input + 6 * 8);
+  in[7] = load_input_data8(input + 7 * 8);
 }
 
 static INLINE void load_buffer_8x16(const tran_low_t *const input,
                                     __m128i *const in) {
-  in[0] = load_input_data(input + 0 * 16);
-  in[1] = load_input_data(input + 1 * 16);
-  in[2] = load_input_data(input + 2 * 16);
-  in[3] = load_input_data(input + 3 * 16);
-  in[4] = load_input_data(input + 4 * 16);
-  in[5] = load_input_data(input + 5 * 16);
-  in[6] = load_input_data(input + 6 * 16);
-  in[7] = load_input_data(input + 7 * 16);
-
-  in[8] = load_input_data(input + 8 * 16);
-  in[9] = load_input_data(input + 9 * 16);
-  in[10] = load_input_data(input + 10 * 16);
-  in[11] = load_input_data(input + 11 * 16);
-  in[12] = load_input_data(input + 12 * 16);
-  in[13] = load_input_data(input + 13 * 16);
-  in[14] = load_input_data(input + 14 * 16);
-  in[15] = load_input_data(input + 15 * 16);
+  in[0] = load_input_data8(input + 0 * 16);
+  in[1] = load_input_data8(input + 1 * 16);
+  in[2] = load_input_data8(input + 2 * 16);
+  in[3] = load_input_data8(input + 3 * 16);
+  in[4] = load_input_data8(input + 4 * 16);
+  in[5] = load_input_data8(input + 5 * 16);
+  in[6] = load_input_data8(input + 6 * 16);
+  in[7] = load_input_data8(input + 7 * 16);
+
+  in[8] = load_input_data8(input + 8 * 16);
+  in[9] = load_input_data8(input + 9 * 16);
+  in[10] = load_input_data8(input + 10 * 16);
+  in[11] = load_input_data8(input + 11 * 16);
+  in[12] = load_input_data8(input + 12 * 16);
+  in[13] = load_input_data8(input + 13 * 16);
+  in[14] = load_input_data8(input + 14 * 16);
+  in[15] = load_input_data8(input + 15 * 16);
 }
 
 static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {
@@ -258,6 +269,78 @@ static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
   *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
 }
 
+static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  int j = 0;
+  while (j < 32) {
+    in[j] = _mm_adds_epi16(in[j], final_rounding);
+    in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
+
+    in[j] = _mm_srai_epi16(in[j], 6);
+    in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
+
+    recon_and_store(dst, in[j]);
+    dst += stride;
+    recon_and_store(dst, in[j + 1]);
+    dst += stride;
+    j += 2;
+  }
+}
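
store_buffer_8x32() applies the final rounding shift of the 32x32 inverse transform: add 32, arithmetic shift right by 6, then add the residual to the prediction with clamping (recon_and_store). A scalar sketch of one pixel (illustrative only; recon_pixel is a placeholder name):

#include <stdint.h>

/* (residual + 32) >> 6, added to the 8-bit predictor and clamped to
 * [0, 255] -- the scalar analogue of the _mm_adds_epi16 / _mm_srai_epi16 /
 * recon_and_store sequence above. */
static uint8_t recon_pixel(int16_t residual, uint8_t pred) {
  const int rounded = (residual + 32) >> 6;
  const int pixel = pred + rounded;
  if (pixel < 0) return 0;
  if (pixel > 255) return 255;
  return (uint8_t)pixel;
}
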
+
+// Addition and subtraction butterfly only (no multiplies); size is 16 or 32.
+static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
+                                     int size) {
+  int i = 0;
+  const int num = size >> 1;
+  const int bound = size - 1;
+  while (i < num) {
+    out[i] = _mm_add_epi16(in[i], in[bound - i]);
+    out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
+    i++;
+  }
+}
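
add_sub_butterfly() pairs element i with element size-1-i: the sums land in the low half of the output and the differences in the high half, which is the final stage of the 16- and 32-point IDCTs. A scalar sketch under the same preconditions (size is 16 or 32, in and out do not alias; illustrative only):

#include <stdint.h>

/* Scalar form of add_sub_butterfly(). */
static void add_sub_butterfly_scalar(const int16_t *in, int16_t *out,
                                     int size) {
  int i;
  for (i = 0; i < size / 2; ++i) {
    out[i] = (int16_t)(in[i] + in[size - 1 - i]);
    out[size - 1 - i] = (int16_t)(in[i] - in[size - 1 - i]);
  }
}
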
+
+#define BUTTERFLY_PAIR(x0, x1, co0, co1)         \
+  do {                                           \
+    tmp0 = _mm_madd_epi16(x0, co0);              \
+    tmp1 = _mm_madd_epi16(x1, co0);              \
+    tmp2 = _mm_madd_epi16(x0, co1);              \
+    tmp3 = _mm_madd_epi16(x1, co1);              \
+    tmp0 = _mm_add_epi32(tmp0, rounding);        \
+    tmp1 = _mm_add_epi32(tmp1, rounding);        \
+    tmp2 = _mm_add_epi32(tmp2, rounding);        \
+    tmp3 = _mm_add_epi32(tmp3, rounding);        \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  } while (0)
+
+static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
+                             const __m128i *c0, const __m128i *c1, __m128i *y0,
+                             __m128i *y1) {
+  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  u0 = _mm_unpacklo_epi16(*x0, *x1);
+  u1 = _mm_unpackhi_epi16(*x0, *x1);
+  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
+  *y0 = _mm_packs_epi32(tmp0, tmp1);
+  *y1 = _mm_packs_epi32(tmp2, tmp3);
+}
+
+static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
+                                  const __m128i *c1) {
+  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  u0 = _mm_unpacklo_epi16(*x0, *x1);
+  u1 = _mm_unpackhi_epi16(*x0, *x1);
+  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
+  *x0 = _mm_packs_epi32(tmp0, tmp1);
+  *x1 = _mm_packs_epi32(tmp2, tmp3);
+}
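
butterfly() and butterfly_self() compute a pair of fixed-point rotations: with constant pairs c0 = (a0, b0) and c1 = (a1, b1) built by pair_set_epi16(), each lane evaluates y0 = x0*a0 + x1*b0 and y1 = x0*a1 + x1*b1, rounds by DCT_CONST_ROUNDING, shifts by DCT_CONST_BITS (14) and packs back to 16 bits with saturation. A one-lane scalar sketch (illustrative only; saturate16 and butterfly_scalar are placeholder names, arithmetic right shift of negative sums assumed as on x86):

#include <stdint.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

static int16_t saturate16(int32_t v) {
  if (v > INT16_MAX) return INT16_MAX;
  if (v < INT16_MIN) return INT16_MIN;
  return (int16_t)v;
}

/* One lane of butterfly(); the SIMD version does the same on eight lanes
 * at once via _mm_madd_epi16 / _mm_packs_epi32. */
static void butterfly_scalar(int16_t x0, int16_t x1, int16_t a0, int16_t b0,
                             int16_t a1, int16_t b1, int16_t *y0,
                             int16_t *y1) {
  *y0 = saturate16((x0 * a0 + x1 * b0 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  *y1 = saturate16((x0 * a1 + x1 * b1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}
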
+
 void idct4_sse2(__m128i *in);
 void idct8_sse2(__m128i *in);
 void idct16_sse2(__m128i *in0, __m128i *in1);
index 0e86e43f11947b1bb3d41bc4e6e92e2d55bcd57b..a2a8858c2a4923b212c8fe74f910783cf4b47a80 100644 (file)
 
 void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
                               int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-  const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-  const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-  const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[8];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp[4];
-
-  // Rows. Load 4-row input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 1);
-  in[2] = load_input_data(input + 8 * 2);
-  in[3] = load_input_data(input + 8 * 3);
-
-  // 4x4 Transpose
-  transpose_16bit_4x4(in, in);
-
-  // Stage1
-  tmp[0] = _mm_mulhrs_epi16(in[0], stg1_0);
-  tmp[1] = _mm_mulhrs_epi16(in[0], stg1_1);
-  tmp[2] = _mm_mulhrs_epi16(in[1], stg1_2);
-  tmp[3] = _mm_mulhrs_epi16(in[1], stg1_3);
-
-  stp1_4 = _mm_unpackhi_epi64(tmp[0], tmp[1]);
-  stp1_5 = _mm_unpackhi_epi64(tmp[2], tmp[3]);
-
-  // Stage2
-  tmp[0] = _mm_mulhrs_epi16(in[0], stg2_0);
-  stp2_0 = _mm_unpacklo_epi64(tmp[0], tmp[0]);
-
-  tmp[1] = _mm_mulhrs_epi16(in[1], stg2_2);
-  tmp[2] = _mm_mulhrs_epi16(in[1], stg2_3);
-  stp2_2 = _mm_unpacklo_epi64(tmp[2], tmp[1]);
-
-  tmp[0] = _mm_add_epi16(stp1_4, stp1_5);
-  tmp[1] = _mm_sub_epi16(stp1_4, stp1_5);
-
-  stp2_4 = tmp[0];
-  stp2_5 = _mm_unpacklo_epi64(tmp[1], zero);
-  stp2_6 = _mm_unpackhi_epi64(tmp[1], zero);
+  const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
+  const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
+  const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
+  const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i cospi_16_64d = _mm_set1_epi16(2 * cospi_16_64);
+  const __m128i cospi_28_64d = _mm_set1_epi16(2 * cospi_28_64);
+  const __m128i cospi_4_64d = _mm_set1_epi16(2 * cospi_4_64);
+  const __m128i cospi_n20_64d = _mm_set1_epi16(-2 * cospi_20_64);
+  const __m128i cospi_12_64d = _mm_set1_epi16(2 * cospi_12_64);
+  const __m128i cospi_24_64d = _mm_set1_epi16(2 * cospi_24_64);
+  const __m128i cospi_8_64d = _mm_set1_epi16(2 * cospi_8_64);
+  __m128i in[8], step1[8], step2[8], tmp[4];
+
+  in[0] = load_input_data4(input + 0 * 8);
+  in[1] = load_input_data4(input + 1 * 8);
+  in[2] = load_input_data4(input + 2 * 8);
+  in[3] = load_input_data4(input + 3 * 8);
+
+  // pass 1
 
-  tmp[0] = _mm_unpacklo_epi16(stp2_5, stp2_6);
-  tmp[1] = _mm_madd_epi16(tmp[0], stg3_0);
-  tmp[2] = _mm_madd_epi16(tmp[0], stk2_0);  // stg3_1 = stk2_0
-
-  tmp[1] = _mm_add_epi32(tmp[1], rounding);
-  tmp[2] = _mm_add_epi32(tmp[2], rounding);
-  tmp[1] = _mm_srai_epi32(tmp[1], DCT_CONST_BITS);
-  tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
-
-  stp1_5 = _mm_packs_epi32(tmp[1], tmp[2]);
-
-  // Stage3
-  tmp[2] = _mm_add_epi16(stp2_0, stp2_2);
-  tmp[3] = _mm_sub_epi16(stp2_0, stp2_2);
-
-  stp1_2 = _mm_unpackhi_epi64(tmp[3], tmp[2]);
-  stp1_3 = _mm_unpacklo_epi64(tmp[3], tmp[2]);
-
-  // Stage4
-  tmp[0] = _mm_add_epi16(stp1_3, stp2_4);
-  tmp[1] = _mm_add_epi16(stp1_2, stp1_5);
-  tmp[2] = _mm_sub_epi16(stp1_3, stp2_4);
-  tmp[3] = _mm_sub_epi16(stp1_2, stp1_5);
+  transpose_16bit_4x4(in, in);
+  // in[0]: 00 10 20 30  01 11 21 31
+  // in[1]: 02 12 22 32  03 13 23 33
+
+  // stage 1
+  tmp[0] = _mm_unpacklo_epi64(in[0], in[0]);
+  tmp[1] = _mm_unpackhi_epi64(in[0], in[0]);
+  tmp[2] = _mm_unpacklo_epi64(in[1], in[1]);
+  tmp[3] = _mm_unpackhi_epi64(in[1], in[1]);
+  step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d);    // step1 4&7
+  step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d);  // step1 5&6
+
+  // stage 2
+  step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d);  // step2 0&1
+  step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d);     // step2 3&2
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);       // step2 4&7
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);       // step2 5&6
+  step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]);  // step2 6
+
+  // stage 3
+  tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
+  step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]);  // step1 5&6
+  tmp[0] = _mm_add_epi16(step2[0], step2[2]);                      // step1 0&1
+  tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                      // step1 3&2
+  step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                   // step1 2&1
+  step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                   // step1 3&0
+
+  // stage 4
+  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
+  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
+  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
+  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
+
+  // pass 2
 
   idct8x8_12_transpose_16bit_4x8(tmp, in);
 
-  /* Stage1 */
-  stp1_4 = _mm_mulhrs_epi16(in[1], stg1_0);
-  stp1_7 = _mm_mulhrs_epi16(in[1], stg1_1);
-  stp1_5 = _mm_mulhrs_epi16(in[3], stg1_2);
-  stp1_6 = _mm_mulhrs_epi16(in[3], stg1_3);
-
-  /* Stage2 */
-  stp2_0 = _mm_mulhrs_epi16(in[0], stg2_0);
-  stp2_1 = _mm_mulhrs_epi16(in[0], stg2_0);
-
-  stp2_2 = _mm_mulhrs_epi16(in[2], stg2_2);
-  stp2_3 = _mm_mulhrs_epi16(in[2], stg2_3);
-
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
-  /* Stage3 */
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
-  tmp[0] = _mm_unpacklo_epi16(stp2_6, stp2_5);
-  tmp[1] = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
-  tmp[2] = _mm_madd_epi16(tmp[0], stk2_0);
-  tmp[3] = _mm_madd_epi16(tmp[1], stk2_0);
-  tmp[2] = _mm_add_epi32(tmp[2], rounding);
-  tmp[3] = _mm_add_epi32(tmp[3], rounding);
-  tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
-  tmp[3] = _mm_srai_epi32(tmp[3], DCT_CONST_BITS);
-  stp1_6 = _mm_packs_epi32(tmp[2], tmp[3]);
-
-  tmp[2] = _mm_madd_epi16(tmp[0], stk2_1);
-  tmp[3] = _mm_madd_epi16(tmp[1], stk2_1);
-  tmp[2] = _mm_add_epi32(tmp[2], rounding);
-  tmp[3] = _mm_add_epi32(tmp[3], rounding);
-  tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
-  tmp[3] = _mm_srai_epi32(tmp[3], DCT_CONST_BITS);
-  stp1_5 = _mm_packs_epi32(tmp[2], tmp[3]);
-
-  /* Stage4  */
-  in[0] = _mm_add_epi16(stp1_0, stp2_7);
-  in[1] = _mm_add_epi16(stp1_1, stp1_6);
-  in[2] = _mm_add_epi16(stp1_2, stp1_5);
-  in[3] = _mm_add_epi16(stp1_3, stp2_4);
-  in[4] = _mm_sub_epi16(stp1_3, stp2_4);
-  in[5] = _mm_sub_epi16(stp1_2, stp1_5);
-  in[6] = _mm_sub_epi16(stp1_1, stp1_6);
-  in[7] = _mm_sub_epi16(stp1_0, stp2_7);
+  // stage 1
+  step1[4] = _mm_mulhrs_epi16(in[1], cospi_28_64d);
+  step1[7] = _mm_mulhrs_epi16(in[1], cospi_4_64d);
+  step1[5] = _mm_mulhrs_epi16(in[3], cospi_n20_64d);
+  step1[6] = _mm_mulhrs_epi16(in[3], cospi_12_64d);
+
+  // stage 2
+  step2[0] = _mm_mulhrs_epi16(in[0], cospi_16_64d);  // step2[1] = step2[0]
+  step2[2] = _mm_mulhrs_epi16(in[2], cospi_24_64d);
+  step2[3] = _mm_mulhrs_epi16(in[2], cospi_8_64d);
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[0], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
+                           &step1[5], &step1[6]);
+
+  // stage 4
+  in[0] = _mm_add_epi16(step1[0], step2[7]);
+  in[1] = _mm_add_epi16(step1[1], step1[6]);
+  in[2] = _mm_add_epi16(step1[2], step1[5]);
+  in[3] = _mm_add_epi16(step1[3], step2[4]);
+  in[4] = _mm_sub_epi16(step1[3], step2[4]);
+  in[5] = _mm_sub_epi16(step1[2], step1[5]);
+  in[6] = _mm_sub_epi16(step1[1], step1[6]);
+  in[7] = _mm_sub_epi16(step1[0], step2[7]);
 
   write_buffer_8x8(in, dest, stride);
 }
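
Two idioms in the rewritten 12-coefficient path are worth noting. Multiplications by a single cosine use _mm_mulhrs_epi16 against a doubled constant: mulhrs returns round(a*b / 2^15), so with b = 2*c it gives the usual Q14 rounding round(a*c / 2^14) without widening to 32 bits. The dual_set_epi16() constants place two different cosines in the low and high 64-bit halves so one multiply yields two step outputs at once (hence comments like "step1 4&7"). A one-lane scalar model of the mulhrs trick (illustrative only; mulhrs_scalar is a placeholder name, arithmetic right shift of negatives assumed):

#include <stdint.h>

/* Scalar model of _mm_mulhrs_epi16 on one lane:
 * ((a * b >> 14) + 1) >> 1, i.e. round(a * b / 2^15). */
static int16_t mulhrs_scalar(int16_t a, int16_t b) {
  return (int16_t)(((((int32_t)a * b) >> 14) + 1) >> 1);
}

/* With b = 2 * c this matches the DCT_CONST_BITS (Q14) rounding used by the
 * madd-based butterflies: mulhrs_scalar(a, 2 * c) == (a * c + (1 << 13)) >> 14
 * for in-range inputs. */
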
 
-// Only do addition and subtraction butterfly, size = 16, 32
-static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
-                                     int size) {
-  int i = 0;
-  const int num = size >> 1;
-  const int bound = size - 1;
-  while (i < num) {
-    out[i] = _mm_add_epi16(in[i], in[bound - i]);
-    out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
-    i++;
-  }
-}
-
-#define BUTTERFLY_PAIR(x0, x1, co0, co1)         \
-  do {                                           \
-    tmp0 = _mm_madd_epi16(x0, co0);              \
-    tmp1 = _mm_madd_epi16(x1, co0);              \
-    tmp2 = _mm_madd_epi16(x0, co1);              \
-    tmp3 = _mm_madd_epi16(x1, co1);              \
-    tmp0 = _mm_add_epi32(tmp0, rounding);        \
-    tmp1 = _mm_add_epi32(tmp1, rounding);        \
-    tmp2 = _mm_add_epi32(tmp2, rounding);        \
-    tmp3 = _mm_add_epi32(tmp3, rounding);        \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-  } while (0)
-
-static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
-                             const __m128i *c0, const __m128i *c1, __m128i *y0,
-                             __m128i *y1) {
-  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm_unpacklo_epi16(*x0, *x1);
-  u1 = _mm_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *y0 = _mm_packs_epi32(tmp0, tmp1);
-  *y1 = _mm_packs_epi32(tmp2, tmp3);
-}
-
-static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
-                                  const __m128i *c1) {
-  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm_unpacklo_epi16(*x0, *x1);
-  u1 = _mm_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *x0 = _mm_packs_epi32(tmp0, tmp1);
-  *x1 = _mm_packs_epi32(tmp2, tmp3);
-}
-
 static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
   const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
   const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
@@ -396,14 +303,14 @@ void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
   int i;
 
   // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
+  in[0] = load_input_data8(input + 0 * 32);
+  in[1] = load_input_data8(input + 1 * 32);
+  in[2] = load_input_data8(input + 2 * 32);
+  in[3] = load_input_data8(input + 3 * 32);
+  in[4] = load_input_data8(input + 4 * 32);
+  in[5] = load_input_data8(input + 5 * 32);
+  in[6] = load_input_data8(input + 6 * 32);
+  in[7] = load_input_data8(input + 7 * 32);
 
   transpose_16bit_8x8(in, in);
   idct32_34_first_half(in, stp1);
@@ -437,8 +344,8 @@ static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
                               __m128i *in1) {
   int i;
   for (i = 0; i < 16; i++) {
-    in0[i] = load_input_data(input);
-    in1[i] = load_input_data(input + 8);
+    in0[i] = load_input_data8(input);
+    in1[i] = load_input_data8(input + 8);
     input += 32;
   }
 }
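
load_buffer_16x16() gathers the top-left 16x16 corner of the 32x32 coefficient block, the only region the 135-coefficient path assumes can be non-zero (the 34-coefficient path above similarly loads only the top-left 8x8). Each row is split into two 8-column halves so one register holds one 8-wide row. A plain-C sketch of the same addressing (illustrative only; load_16x16_corner is a placeholder name):

#include <stdint.h>

/* Copy the top-left 16x16 corner of a row-major 32x32 coefficient block into
 * two 8-column halves, mirroring the indexing in load_buffer_16x16(). */
static void load_16x16_corner(const int16_t *input /* 32x32, row-major */,
                              int16_t out0[16][8], int16_t out1[16][8]) {
  int i, j;
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 8; ++j) {
      out0[i][j] = input[i * 32 + j];     /* columns 0..7  */
      out1[i][j] = input[i * 32 + 8 + j]; /* columns 8..15 */
    }
  }
}
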
@@ -715,24 +622,6 @@ static void idct32_8x32_135(__m128i *in /*in[32]*/) {
   add_sub_butterfly(out, in, 32);
 }
 
-static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  int j = 0;
-  while (j < 32) {
-    in[j] = _mm_adds_epi16(in[j], final_rounding);
-    in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
-
-    in[j] = _mm_srai_epi16(in[j], 6);
-    in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
-
-    recon_and_store(dst, in[j]);
-    dst += stride;
-    recon_and_store(dst, in[j + 1]);
-    dst += stride;
-    j += 2;
-  }
-}
-
 static INLINE void recon_and_store_ssse3(__m128i *in0, __m128i *in1,
                                          uint8_t *dest, int stride) {
   store_buffer_8x32(in0, dest, stride);
@@ -793,306 +682,3 @@ void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
   idct32_135(col0, col1);
   recon_and_store_ssse3(col0, col1, dest + 16, stride);
 }
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
-// output pixels: 8-15 in __m128i in[32]
-static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
-                                       __m128i *out /*out[16]*/) {
-  __m128i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
-  __m128i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_
-
-  {
-    const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-    const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-    const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-    const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
-    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
-  }
-
-  v8 = _mm_add_epi16(u8, u9);
-  v9 = _mm_sub_epi16(u8, u9);
-  v14 = _mm_sub_epi16(u15, u14);
-  v15 = _mm_add_epi16(u15, u14);
-
-  {
-    const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-    const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-    const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-    const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
-    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
-  }
-
-  v10 = _mm_sub_epi16(u11, u10);
-  v11 = _mm_add_epi16(u11, u10);
-  v12 = _mm_add_epi16(u12, u13);
-  v13 = _mm_sub_epi16(u12, u13);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(v8, v11);
-  out[1] = _mm_add_epi16(v9, v10);
-  out[6] = _mm_add_epi16(v14, v13);
-  out[7] = _mm_add_epi16(v15, v12);
-
-  out[2] = _mm_sub_epi16(v9, v10);
-  out[3] = _mm_sub_epi16(v8, v11);
-  out[4] = _mm_sub_epi16(v15, v12);
-  out[5] = _mm_sub_epi16(v14, v13);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
-// output pixels: 0-7 in __m128i in[32]
-static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
-                                       __m128i *out /*out[8]*/) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_
-
-  {
-    const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-    const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-    const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-    const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
-    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
-  }
-
-  v4 = _mm_add_epi16(u4, u5);
-  v5 = _mm_sub_epi16(u4, u5);
-  v6 = _mm_sub_epi16(u7, u6);
-  v7 = _mm_add_epi16(u7, u6);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-    const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-    const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-
-    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
-    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
-  }
-
-  v0 = _mm_add_epi16(u0, u3);
-  v1 = _mm_add_epi16(u1, u2);
-  v2 = _mm_sub_epi16(u1, u2);
-  v3 = _mm_sub_epi16(u0, u3);
-
-  out[0] = _mm_add_epi16(v0, v7);
-  out[1] = _mm_add_epi16(v1, v6);
-  out[2] = _mm_add_epi16(v2, v5);
-  out[3] = _mm_add_epi16(v3, v4);
-  out[4] = _mm_sub_epi16(v3, v4);
-  out[5] = _mm_sub_epi16(v2, v5);
-  out[6] = _mm_sub_epi16(v1, v6);
-  out[7] = _mm_sub_epi16(v0, v7);
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with odd index,
-// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-// output pixels: 16-23, 24-31 in __m128i in[32]
-// We avoid hiding an offset, 16, inside this function, so we output 0-15 into
-// array out[16]
-static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
-                                         __m128i *out /*out[16]*/) {
-  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-    const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-    const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-    const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-    const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-    const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-    const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-    const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-    const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-    const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-    const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-    const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-    const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-    const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-    const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-    const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
-    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
-    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
-    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
-
-    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
-    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
-
-    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
-    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
-  }
-
-  v16 = _mm_add_epi16(u16, u17);
-  v17 = _mm_sub_epi16(u16, u17);
-  v18 = _mm_sub_epi16(u19, u18);
-  v19 = _mm_add_epi16(u19, u18);
-
-  v20 = _mm_add_epi16(u20, u21);
-  v21 = _mm_sub_epi16(u20, u21);
-  v22 = _mm_sub_epi16(u23, u22);
-  v23 = _mm_add_epi16(u23, u22);
-
-  v24 = _mm_add_epi16(u24, u25);
-  v25 = _mm_sub_epi16(u24, u25);
-  v26 = _mm_sub_epi16(u27, u26);
-  v27 = _mm_add_epi16(u27, u26);
-
-  v28 = _mm_add_epi16(u28, u29);
-  v29 = _mm_sub_epi16(u28, u29);
-  v30 = _mm_sub_epi16(u31, u30);
-  v31 = _mm_add_epi16(u31, u30);
-
-  {
-    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm_add_epi16(v16, v19);
-  u17 = _mm_add_epi16(v17, v18);
-  u18 = _mm_sub_epi16(v17, v18);
-  u19 = _mm_sub_epi16(v16, v19);
-  u20 = _mm_sub_epi16(v23, v20);
-  u21 = _mm_sub_epi16(v22, v21);
-  u22 = _mm_add_epi16(v22, v21);
-  u23 = _mm_add_epi16(v23, v20);
-
-  u24 = _mm_add_epi16(v24, v27);
-  u25 = _mm_add_epi16(v25, v26);
-  u26 = _mm_sub_epi16(v25, v26);
-  u27 = _mm_sub_epi16(v24, v27);
-
-  u28 = _mm_sub_epi16(v31, v28);
-  u29 = _mm_sub_epi16(v30, v29);
-  u30 = _mm_add_epi16(v29, v30);
-  u31 = _mm_add_epi16(v28, v31);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(u16, u23);
-  out[1] = _mm_add_epi16(u17, u22);
-  out[2] = _mm_add_epi16(u18, u21);
-  out[3] = _mm_add_epi16(u19, u20);
-  out[4] = _mm_sub_epi16(u19, u20);
-  out[5] = _mm_sub_epi16(u18, u21);
-  out[6] = _mm_sub_epi16(u17, u22);
-  out[7] = _mm_sub_epi16(u16, u23);
-
-  out[8] = _mm_sub_epi16(u31, u24);
-  out[9] = _mm_sub_epi16(u30, u25);
-  out[10] = _mm_sub_epi16(u29, u26);
-  out[11] = _mm_sub_epi16(u28, u27);
-  out[12] = _mm_add_epi16(u27, u28);
-  out[13] = _mm_add_epi16(u26, u29);
-  out[14] = _mm_add_epi16(u25, u30);
-  out[15] = _mm_add_epi16(u24, u31);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
-    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
-    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
-    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
-  }
-}
-
-static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
-                                         __m128i *out /*out[32]*/) {
-  __m128i temp[16];
-  idct32_full_8x32_quarter_1(in, temp);
-  idct32_full_8x32_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-static void idct32_full_8x32(const __m128i *in /*in[32]*/,
-                             __m128i *out /*out[32]*/) {
-  __m128i temp[32];
-  idct32_full_8x32_quarter_1_2(in, temp);
-  idct32_full_8x32_quarter_3_4(in, &temp[16]);
-  add_sub_butterfly(temp, out, 32);
-}
-
-static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
-  int i;
-  for (i = 0; i < 8; ++i) {
-    in[i] = load_input_data(input);
-    in[i + 8] = load_input_data(input + 8);
-    in[i + 16] = load_input_data(input + 16);
-    in[i + 24] = load_input_data(input + 24);
-    input += 32;
-  }
-}
-
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                                  int stride) {
-  __m128i col[128], in[32];
-  int i, j;
-
-  // rows
-  for (i = 0; i < 4; ++i) {
-    load_buffer_8x32(input, in);
-    input += 32 << 3;
-
-    // Transpose 32x8 block to 8x32 block
-    transpose_16bit_8x8(in, in);
-    transpose_16bit_8x8(in + 8, in + 8);
-    transpose_16bit_8x8(in + 16, in + 16);
-    transpose_16bit_8x8(in + 24, in + 24);
-
-    idct32_full_8x32(in, col + (i << 5));
-  }
-
-  // columns
-  for (i = 0; i < 4; ++i) {
-    j = i << 3;
-    // Transpose 32x8 block to 8x32 block
-    transpose_16bit_8x8(col + j, in);
-    transpose_16bit_8x8(col + j + 32, in + 8);
-    transpose_16bit_8x8(col + j + 64, in + 16);
-    transpose_16bit_8x8(col + j + 96, in + 24);
-
-    idct32_full_8x32(in, in);
-    store_buffer_8x32(in, dest, stride);
-    dest += 8;
-  }
-}