Merge changes I92eb4312,Ibb2afe4e

author Johann Koenig <johannkoenig@google.com>

Wed, 10 May 2017 18:19:50 +0000 (18:19 +0000)

committer Gerrit Code Review <noreply-gerritcodereview@google.com>

Wed, 10 May 2017 18:19:52 +0000 (18:19 +0000)
author Johann Koenig <johannkoenig@google.com>
Wed, 10 May 2017 18:19:50 +0000 (18:19 +0000)
committer Gerrit Code Review <noreply-gerritcodereview@google.com>
Wed, 10 May 2017 18:19:52 +0000 (18:19 +0000)
diff --git a/test/datarate_test.cc b/test/datarate_test.cc

index dff75357e21bda0a5b6e7754e620ff424f62a44e..c1a925f7b032fca65a4df2426c398f432f3e47ae 100644 (file)
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -1041,7 +1041,7 @@ TEST_P(DatarateTestVP9LargeDenoiser, 4threads) {
    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
        << " The datarate for the file is lower than target by too much!";
-  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.28)
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29)
        << " The datarate for the file is greater than target by too much!";
  }
  
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc

index f9745ed81fb8c3249b4b86f4d3929719c1db3ebc..6ea77fde28ea381ee901191ae52700189aebb5e3 100644 (file)
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -255,11 +255,11 @@ void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
  
  #if CONFIG_VP9_HIGHBITDEPTH
  void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_c(in, out, stride, 10);
+  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_c(in, out, stride, 12);
+  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  
  void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
@@ -273,36 +273,36 @@ void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
  }
  
  void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10);
+  vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
  }
  
  void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
+  vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
  }
  
  #if HAVE_SSE2
  void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_c(in, out, stride, 10);
+  vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_c(in, out, stride, 12);
+  vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  
  void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  
  void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  #endif  // HAVE_SSE2
  #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -353,7 +353,7 @@ class Trans16x16TestBase {
  #if CONFIG_VP9_HIGHBITDEPTH
        } else {
          ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
  #endif
        }
  
@@ -475,10 +475,10 @@ class Trans16x16TestBase {
          ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
  #if CONFIG_VP9_HIGHBITDEPTH
        } else {
-        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,
+        inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_,
                       tx_type_);
          ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(output_ref_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_));
  #endif
        }
        if (bit_depth_ == VPX_BITS_8) {
@@ -530,8 +530,7 @@ class Trans16x16TestBase {
          ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
  #if CONFIG_VP9_HIGHBITDEPTH
        } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), 16));
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16));
  #endif  // CONFIG_VP9_HIGHBITDEPTH
        }
  
@@ -585,9 +584,9 @@ class Trans16x16TestBase {
          ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
        } else {
  #if CONFIG_VP9_HIGHBITDEPTH
-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
          ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
  #endif  // CONFIG_VP9_HIGHBITDEPTH
        }
  
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc

index a168e690ee3296be2473158ba0921a827737f5aa..d8054c4eb59a5c2333c4513759dffc18c5fa71b1 100644 (file)
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -71,11 +71,11 @@ typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
  
  #if CONFIG_VP9_HIGHBITDEPTH
  void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10);
+  vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12);
+  vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  #endif  // CONFIG_VP9_HIGHBITDEPTH
  
@@ -137,7 +137,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
  #if CONFIG_VP9_HIGHBITDEPTH
      } else {
        ASM_REGISTER_STATE_CHECK(
-          inv_txfm_(test_temp_block, CONVERT_TO_BYTEPTR(dst16), 32));
+          inv_txfm_(test_temp_block, CAST_TO_BYTEPTR(dst16), 32));
  #endif
      }
  
@@ -275,7 +275,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
        ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
  #if CONFIG_VP9_HIGHBITDEPTH
      } else {
-      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CAST_TO_BYTEPTR(dst16), 32));
  #endif
      }
      for (int j = 0; j < kNumCoeffs; ++j) {
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc

index 444b0209d513d7dcee0b77b942e685c5b8f0feaa..bd2327520bce00cdc61d0e7d3f443d670e76c4f8 100644 (file)
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -55,36 +55,36 @@ void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
  
  #if CONFIG_VP9_HIGHBITDEPTH
  void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_c(in, out, stride, 10);
+  vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_c(in, out, stride, 12);
+  vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  
  void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10);
+  vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
  }
  
  void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12);
+  vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
  }
  
  void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10);
+  vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12);
+  vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  
  #if HAVE_SSE2
  void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  #endif  // HAVE_SSE2
  #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -135,7 +135,7 @@ class Trans4x4TestBase {
  #if CONFIG_VP9_HIGHBITDEPTH
        } else {
          ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
  #endif
        }
  
@@ -249,7 +249,7 @@ class Trans4x4TestBase {
  #if CONFIG_VP9_HIGHBITDEPTH
        } else {
          ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
  #endif
        }
  
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc

index 7c2df67f6a9e6dd61ef18bb00dda3a0986d78ad3..dfbb5dc3d423e721df68521868bfdfccf5bbddab 100644 (file)
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -88,45 +88,45 @@ void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
  
  #if CONFIG_VP9_HIGHBITDEPTH
  void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_c(in, out, stride, 10);
+  vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_c(in, out, stride, 12);
+  vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  
  void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10);
+  vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
  }
  
  void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
+  vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
  }
  
  #if HAVE_SSE2
  
  void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_c(in, out, stride, 10);
+  vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_c(in, out, stride, 12);
+  vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  
  void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  
  void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
  }
  
  void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
  }
  #endif  // HAVE_SSE2
  #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -257,7 +257,7 @@ class FwdTrans8x8TestBase {
  #if CONFIG_VP9_HIGHBITDEPTH
        } else {
          ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
  #endif
        }
  
@@ -340,7 +340,7 @@ class FwdTrans8x8TestBase {
  #if CONFIG_VP9_HIGHBITDEPTH
        } else {
          ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
  #endif
        }
  
@@ -413,7 +413,7 @@ class FwdTrans8x8TestBase {
  #if CONFIG_VP9_HIGHBITDEPTH
        } else {
          ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
  #endif
        }
  
@@ -497,9 +497,9 @@ class FwdTrans8x8TestBase {
          ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
  #if CONFIG_VP9_HIGHBITDEPTH
        } else {
-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
          ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
  #endif
        }
  
@@ -671,16 +671,11 @@ INSTANTIATE_TEST_CASE_P(
  #endif  // CONFIG_VP9_HIGHBITDEPTH
  
  #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct8x8_neon,
-                                                     &vpx_idct8x8_64_add_neon,
-                                                     0, VPX_BITS_8)));
-#else   // !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
                          ::testing::Values(make_tuple(&vpx_fdct8x8_neon,
                                                       &vpx_idct8x8_64_add_neon,
                                                       0, VPX_BITS_8)));
+#if !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      NEON, FwdTrans8x8HT,
      ::testing::Values(
@@ -688,8 +683,8 @@ INSTANTIATE_TEST_CASE_P(
          make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
          make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
          make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
  
  #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
  INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT,
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc

index 0759cd41c6968571a4c1046896dbb48531ab2d24..2dda7da4520075fbef59c3bcba483ac90c6cf119 100644 (file)
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -43,9 +43,11 @@ void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
  }
  
  #if CONFIG_VP9_HIGHBITDEPTH
-template <InvTxfmWithBdFunc fn>
+typedef void (*InvTxfmHighbdFunc)(const tran_low_t *in, uint16_t *out,
+                                  int stride, int bd);
+template <InvTxfmHighbdFunc fn>
  void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
-  fn(in, CONVERT_TO_BYTEPTR(out), stride, bd);
+  fn(in, CAST_TO_SHORTPTR(out), stride, bd);
  }
  #endif
  
@@ -452,31 +454,31 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = {
               &highbd_wrapper<vpx_highbd_idct32x32_1024_add_neon>, TX_32X32,
               1024, 12, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_135_add_neon>, TX_32X32, 135, 8, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_135_add_neon>, TX_32X32, 135, 10, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_135_add_neon>, TX_32X32, 135, 12, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_34_add_neon>, TX_32X32, 34, 8, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_34_add_neon>, TX_32X32, 34, 10, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_34_add_neon>, TX_32X32, 34, 12, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 8, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 10, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 12, 2),
    make_tuple(
        &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
@@ -488,31 +490,31 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = {
        &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_256_add_neon>, TX_16X16, 256, 12, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_38_add_neon>, TX_16X16, 38, 8, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_38_add_neon>, TX_16X16, 38, 10, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_38_add_neon>, TX_16X16, 38, 12, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_10_add_neon>, TX_16X16, 10, 8, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_10_add_neon>, TX_16X16, 10, 10, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_10_add_neon>, TX_16X16, 10, 12, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 8, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 10, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 12, 2),
    make_tuple(&vpx_highbd_fdct8x8_c,
               &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
@@ -556,29 +558,29 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = {
  #endif  // CONFIG_VP9_HIGHBITDEPTH
    make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
               &wrapper<vpx_idct32x32_1024_add_neon>, TX_32X32, 1024, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>,
               &wrapper<vpx_idct32x32_135_add_neon>, TX_32X32, 135, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
               &wrapper<vpx_idct32x32_34_add_neon>, TX_32X32, 34, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
               &wrapper<vpx_idct32x32_1_add_neon>, TX_32X32, 1, 8, 1),
    make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
               &wrapper<vpx_idct16x16_256_add_neon>, TX_16X16, 256, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_38_add_c>,
               &wrapper<vpx_idct16x16_38_add_neon>, TX_16X16, 38, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>,
               &wrapper<vpx_idct16x16_10_add_neon>, TX_16X16, 10, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>,
               &wrapper<vpx_idct16x16_1_add_neon>, TX_16X16, 1, 8, 1),
    make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
               &wrapper<vpx_idct8x8_64_add_neon>, TX_8X8, 64, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
               &wrapper<vpx_idct8x8_12_add_neon>, TX_8X8, 12, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_1_add_c>,
               &wrapper<vpx_idct8x8_1_add_neon>, TX_8X8, 1, 8, 1),
    make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
               &wrapper<vpx_idct4x4_16_add_neon>, TX_4X4, 16, 8, 1),
-  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
+  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_1_add_c>,
               &wrapper<vpx_idct4x4_1_add_neon>, TX_4X4, 1, 8, 1)
  };
  
@@ -591,13 +593,13 @@ INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest,
  const PartialInvTxfmParam sse2_partial_idct_tests[] = {
  #if CONFIG_VP9_HIGHBITDEPTH
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_1_add_sse2>, TX_32X32, 1, 10, 2),
    make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
        &highbd_wrapper<vpx_highbd_idct32x32_1_add_sse2>, TX_32X32, 1, 12, 2),
    make_tuple(
        &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
@@ -609,13 +611,13 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
        &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse2>, TX_16X16, 256, 12, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 10, 2),
    make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 12, 2),
    make_tuple(&vpx_highbd_fdct8x8_c,
               &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
@@ -627,13 +629,13 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
        &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
        &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse2>, TX_8X8, 64, 12, 2),
    make_tuple(&vpx_highbd_fdct8x8_c,
-             &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+             &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
               &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse2>, TX_8X8, 12, 8, 2),
    make_tuple(
-      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
        &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse2>, TX_8X8, 12, 10, 2),
    make_tuple(
-      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
        &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse2>, TX_8X8, 12, 12, 2),
    make_tuple(&vpx_highbd_fdct4x4_c,
               &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
@@ -647,27 +649,25 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
  #endif  // CONFIG_VP9_HIGHBITDEPTH
    make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
               &wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
-             &wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 135, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
               &wrapper<vpx_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
               &wrapper<vpx_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 1),
    make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
               &wrapper<vpx_idct16x16_256_add_sse2>, TX_16X16, 256, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>,
               &wrapper<vpx_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>,
               &wrapper<vpx_idct16x16_1_add_sse2>, TX_16X16, 1, 8, 1),
    make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
               &wrapper<vpx_idct8x8_64_add_sse2>, TX_8X8, 64, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
               &wrapper<vpx_idct8x8_12_add_sse2>, TX_8X8, 12, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_1_add_c>,
               &wrapper<vpx_idct8x8_1_add_sse2>, TX_8X8, 1, 8, 1),
    make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
               &wrapper<vpx_idct4x4_16_add_sse2>, TX_4X4, 16, 8, 1),
-  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
+  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_1_add_c>,
               &wrapper<vpx_idct4x4_1_add_sse2>, TX_4X4, 1, 8, 1)
  };
  
@@ -680,13 +680,13 @@ INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest,
  const PartialInvTxfmParam ssse3_partial_idct_tests[] = {
    make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
               &wrapper<vpx_idct32x32_1024_add_ssse3>, TX_32X32, 1024, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>,
               &wrapper<vpx_idct32x32_135_add_ssse3>, TX_32X32, 135, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
               &wrapper<vpx_idct32x32_34_add_ssse3>, TX_32X32, 34, 8, 1),
    make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
               &wrapper<vpx_idct8x8_64_add_ssse3>, TX_8X8, 64, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
               &wrapper<vpx_idct8x8_12_add_ssse3>, TX_8X8, 12, 8, 1)
  };
  
@@ -698,27 +698,25 @@ INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest,
  const PartialInvTxfmParam dspr2_partial_idct_tests[] = {
    make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
               &wrapper<vpx_idct32x32_1024_add_dspr2>, TX_32X32, 1024, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
-             &wrapper<vpx_idct32x32_1024_add_dspr2>, TX_32X32, 135, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
               &wrapper<vpx_idct32x32_34_add_dspr2>, TX_32X32, 34, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
               &wrapper<vpx_idct32x32_1_add_dspr2>, TX_32X32, 1, 8, 1),
    make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
               &wrapper<vpx_idct16x16_256_add_dspr2>, TX_16X16, 256, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>,
               &wrapper<vpx_idct16x16_10_add_dspr2>, TX_16X16, 10, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>,
               &wrapper<vpx_idct16x16_1_add_dspr2>, TX_16X16, 1, 8, 1),
    make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
               &wrapper<vpx_idct8x8_64_add_dspr2>, TX_8X8, 64, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
               &wrapper<vpx_idct8x8_12_add_dspr2>, TX_8X8, 12, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_1_add_c>,
               &wrapper<vpx_idct8x8_1_add_dspr2>, TX_8X8, 1, 8, 1),
    make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
               &wrapper<vpx_idct4x4_16_add_dspr2>, TX_4X4, 16, 8, 1),
-  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
+  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_1_add_c>,
               &wrapper<vpx_idct4x4_1_add_dspr2>, TX_4X4, 1, 8, 1)
  };
  
@@ -731,27 +729,25 @@ INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest,
  const PartialInvTxfmParam msa_partial_idct_tests[] = {
    make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
               &wrapper<vpx_idct32x32_1024_add_msa>, TX_32X32, 1024, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
-             &wrapper<vpx_idct32x32_1024_add_msa>, TX_32X32, 135, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
               &wrapper<vpx_idct32x32_34_add_msa>, TX_32X32, 34, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
               &wrapper<vpx_idct32x32_1_add_msa>, TX_32X32, 1, 8, 1),
    make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
               &wrapper<vpx_idct16x16_256_add_msa>, TX_16X16, 256, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>,
               &wrapper<vpx_idct16x16_10_add_msa>, TX_16X16, 10, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>,
               &wrapper<vpx_idct16x16_1_add_msa>, TX_16X16, 1, 8, 1),
    make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
               &wrapper<vpx_idct8x8_64_add_msa>, TX_8X8, 64, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
               &wrapper<vpx_idct8x8_12_add_msa>, TX_8X8, 12, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_1_add_c>,
               &wrapper<vpx_idct8x8_1_add_msa>, TX_8X8, 1, 8, 1),
    make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
               &wrapper<vpx_idct4x4_16_add_msa>, TX_4X4, 16, 8, 1),
-  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
+  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_1_add_c>,
               &wrapper<vpx_idct4x4_1_add_msa>, TX_4X4, 1, 8, 1)
  };
  
diff --git a/test/variance_test.cc b/test/variance_test.cc

index 9eb9be3a1de034e9fd8bbff820214427676f2443..57d57c97239cad2f0d57730a0e0ab79fa083eca8 100644 (file)
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -22,6 +22,7 @@
  #include "vpx/vpx_integer.h"
  #include "vpx_mem/vpx_mem.h"
  #include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
  
  namespace {
  
@@ -345,6 +346,7 @@ class MainTestClass
    void RefTest();
    void RefStrideTest();
    void OneQuarterTest();
+  void SpeedTest();
  
    // MSE/SSE tests
    void RefTestMse();
@@ -363,6 +365,7 @@ class MainTestClass
    int byte_shift() const { return params_.bit_depth - 8; }
    int block_size() const { return params_.block_size; }
    int width() const { return params_.width; }
+  int height() const { return params_.height; }
    uint32_t mask() const { return params_.mask; }
  };
  
@@ -471,6 +474,35 @@ void MainTestClass<VarianceFunctionType>::OneQuarterTest() {
    EXPECT_EQ(expected, var);
  }
  
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::SpeedTest() {
+  const int half = block_size() / 2;
+  if (!use_high_bit_depth()) {
+    memset(src_, 255, block_size());
+    memset(ref_, 255, half);
+    memset(ref_ + half, 0, half);
+#if CONFIG_VP9_HIGHBITDEPTH
+  } else {
+    vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+  unsigned int sse;
+
+  vpx_usec_timer timer;
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < 100000000 / block_size(); ++i) {
+    const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse);
+    // Ignore return value.
+    (void)variance;
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("Variance %dx%d time: %5d ms\n", width(), height(),
+         elapsed_time / 1000);
+}
+
  ////////////////////////////////////////////////////////////////////////////////
  // Tests related to MSE / SSE.
  
@@ -727,6 +759,7 @@ TEST_P(VpxVarianceTest, Zero) { ZeroTest(); }
  TEST_P(VpxVarianceTest, Ref) { RefTest(); }
  TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
  TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); }
  TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
  TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
  TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
@@ -809,6 +842,7 @@ TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); }
  TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
  TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
  TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
  TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
  TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
  TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
@@ -1219,10 +1253,13 @@ INSTANTIATE_TEST_CASE_P(
                        VarianceParams(6, 5, &vpx_variance64x32_neon),
                        VarianceParams(5, 6, &vpx_variance32x64_neon),
                        VarianceParams(5, 5, &vpx_variance32x32_neon),
+                      VarianceParams(5, 4, &vpx_variance32x16_neon),
+                      VarianceParams(4, 5, &vpx_variance16x32_neon),
                        VarianceParams(4, 4, &vpx_variance16x16_neon),
                        VarianceParams(4, 3, &vpx_variance16x8_neon),
                        VarianceParams(3, 4, &vpx_variance8x16_neon),
-                      VarianceParams(3, 3, &vpx_variance8x8_neon)));
+                      VarianceParams(3, 3, &vpx_variance8x8_neon),
+                      VarianceParams(3, 2, &vpx_variance8x4_neon)));
  
  INSTANTIATE_TEST_CASE_P(
      NEON, VpxSubpelVarianceTest,
diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc

index 4a10e187b5c94e08d92d58c47d874ea62ecf8424..56ca257c5958a5eee64f57d4d87819fcc922f7ae 100644 (file)
--- a/test/vp9_denoiser_test.cc
+++ b/test/vp9_denoiser_test.cc
@@ -116,4 +116,19 @@ INSTANTIATE_TEST_CASE_P(
                        make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X32),
                        make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X64)));
  #endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP9DenoiserTest,
+    ::testing::Values(make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X8),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X16),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X8),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X16),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X32),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X16),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X32),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X64),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X32),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X64)));
+#endif
  }  // namespace
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c

index 55957414cde9e309aa654aa61b53a38cd26d605a..69069042cc298c050d05192a76d9b7a301919f4d 100644 (file)
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -205,7 +205,7 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
  
  #if CONFIG_VP9_HIGHBITDEPTH
  
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int tx_type, int bd) {
    const highbd_transform_2d IHT_4[] = {
      { vpx_highbd_idct4_c, vpx_highbd_idct4_c },   // DCT_DCT  = 0
@@ -213,7 +213,6 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
      { vpx_highbd_idct4_c, vpx_highbd_iadst4_c },  // DCT_ADST = 2
      { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }  // ADST_ADST = 3
    };
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    int i, j;
    tran_low_t out[4 * 4];
@@ -245,14 +244,13 @@ static const highbd_transform_2d HIGH_IHT_8[] = {
    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c }  // ADST_ADST = 3
  };
  
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int tx_type, int bd) {
    int i, j;
    tran_low_t out[8 * 8];
    tran_low_t *outptr = out;
    tran_low_t temp_in[8], temp_out[8];
    const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    // Inverse transform row vectors.
    for (i = 0; i < 8; ++i) {
@@ -279,14 +277,13 @@ static const highbd_transform_2d HIGH_IHT_16[] = {
    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c }  // ADST_ADST = 3
  };
  
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
                                     int stride, int tx_type, int bd) {
    int i, j;
    tran_low_t out[16 * 16];
    tran_low_t *outptr = out;
    tran_low_t temp_in[16], temp_out[16];
    const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    // Rows
    for (i = 0; i < 16; ++i) {
@@ -307,7 +304,7 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
  }
  
  // idct
-void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
                              int eob, int bd) {
    if (eob > 1)
      vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
@@ -315,7 +312,7 @@ void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
      vpx_highbd_idct4x4_1_add(input, dest, stride, bd);
  }
  
-void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
                              int eob, int bd) {
    if (eob > 1)
      vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);
@@ -323,7 +320,7 @@ void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
      vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);
  }
  
-void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
                              int eob, int bd) {
    // If dc is 1, then input[0] is the reconstructed value, do not need
    // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
@@ -340,7 +337,7 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
    }
  }
  
-void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
                                int stride, int eob, int bd) {
    // The calculation can be simplified if there are not many non-zero dct
    // coefficients. Use eobs to separate different cases.
@@ -356,7 +353,7 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
    }
  }
  
-void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
                                int stride, int eob, int bd) {
    // Non-zero coeff only in upper-left 8x8
    if (eob == 1) {
@@ -372,7 +369,7 @@ void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
  
  // iht
  void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
-                           uint8_t *dest, int stride, int eob, int bd) {
+                           uint16_t *dest, int stride, int eob, int bd) {
    if (tx_type == DCT_DCT)
      vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
    else
@@ -380,7 +377,7 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
  }
  
  void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
-                           uint8_t *dest, int stride, int eob, int bd) {
+                           uint16_t *dest, int stride, int eob, int bd) {
    if (tx_type == DCT_DCT) {
      vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
    } else {
@@ -389,7 +386,7 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
  }
  
  void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
-                             uint8_t *dest, int stride, int eob, int bd) {
+                             uint16_t *dest, int stride, int eob, int bd) {
    if (tx_type == DCT_DCT) {
      vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
    } else {
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h

index ea958a38c0e206d0189c061b1f09c914e560b4c4..3e83b8402de0739161286f2e8cbc5da17b6252be 100644 (file)
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -57,22 +57,22 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                        int stride, int eob);
  
  #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
                              int eob, int bd);
-void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
                              int eob, int bd);
-void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
                              int eob, int bd);
-void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
                                int stride, int eob, int bd);
-void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
                                int stride, int eob, int bd);
  void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
-                           uint8_t *dest, int stride, int eob, int bd);
+                           uint16_t *dest, int stride, int eob, int bd);
  void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
-                           uint8_t *dest, int stride, int eob, int bd);
+                           uint16_t *dest, int stride, int eob, int bd);
  void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
-                             uint8_t *dest, int stride, int eob, int bd);
+                             uint16_t *dest, int stride, int eob, int bd);
  #endif  // CONFIG_VP9_HIGHBITDEPTH
  #ifdef __cplusplus
  }  // extern "C"
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl

index da449e254954cad1c695cb030761322a2f7bb034..20013cb9efaf45cae6c9be223b62f18d6c671aa3 100644 (file)
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -101,11 +101,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    #
    # Note as optimized versions of these functions are added we need to add a check to ensure
    # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
-  add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
+  add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
  
-  add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
+  add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
  
-  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
  }
  
  #
@@ -120,7 +120,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
  #
  if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
    add_proto qw/int vp9_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
-  specialize qw/vp9_denoiser_filter sse2/;
+  specialize qw/vp9_denoiser_filter neon sse2/;
  }
  
  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -225,7 +225,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  #
  # frame based scale
  #
-add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, int phase_scaler";
+add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, uint8_t filter_type, int phase_scaler";
  specialize qw/vp9_scale_and_extend_frame ssse3/;
  
  }
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c

index dcfc454aa0dca9fe1fc31216b1ccce05e848bed2..bb2dcf52bf509920a27dbd1e8422690a6ac259a2 100644 (file)
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -16,7 +16,6 @@
  void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                              int tx_type) {
    __m128i in[2];
-  const __m128i zero = _mm_setzero_si128();
    const __m128i eight = _mm_set1_epi16(8);
  
    in[0] = load_input_data(input);
@@ -49,31 +48,7 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
    in[0] = _mm_srai_epi16(in[0], 4);
    in[1] = _mm_srai_epi16(in[1], 4);
  
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d0 = _mm_add_epi16(d0, in[0]);
-    d2 = _mm_add_epi16(d2, in[1]);
-    d0 = _mm_packus_epi16(d0, d2);
-    // store result[0]
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    // store result[1]
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    // store result[2]
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-    // store result[3]
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-  }
+  recon_and_store4x4_sse2(in, dest, stride);
  }
  
  void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c

index a9e5eebbab8b01269e884ba2ea775173e9dd369c..0760f8c2398ee5c4f0ff82b312f843471be47385 100644 (file)
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -189,21 +189,22 @@ static void inverse_transform_block_inter(MACROBLOCKD *xd, int plane,
    assert(eob > 0);
  #if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
      if (xd->lossless) {
-      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
      } else {
        switch (tx_size) {
          case TX_4X4:
-          vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
            break;
          case TX_8X8:
-          vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd);
            break;
          case TX_16X16:
-          vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd);
            break;
          case TX_32X32:
-          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd);
            break;
          default: assert(0 && "Invalid transform size");
        }
@@ -256,21 +257,22 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane,
    assert(eob > 0);
  #if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
      if (xd->lossless) {
-      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
      } else {
        switch (tx_size) {
          case TX_4X4:
-          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
            break;
          case TX_8X8:
-          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
            break;
          case TX_16X16:
-          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
            break;
          case TX_32X32:
-          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd);
            break;
          default: assert(0 && "Invalid transform size");
        }
diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c

new file mode 100644 (file)

index 0000000..4152e7b
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
@@ -0,0 +1,352 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+  const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
+  const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+  const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
+  const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210),
+                                vget_low_s64(fedcba98_76543210));
+  const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
+  return sum_diff;
+}
+
+// Denoise a 16x1 vector.
+static INLINE int8x16_t denoiser_16x1_neon(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
+    const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
+    const uint8x16_t v_delta_level_1_and_2,
+    const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
+  const uint8x16_t v_sig = vld1q_u8(sig);
+  const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+  /* Calculate absolute difference and sign masks. */
+  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+  /* Figure out which level that put us in. */
+  const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
+  const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
+  const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);
+
+  /* Calculate absolute adjustments for level 1, 2 and 3. */
+  const uint8x16_t v_level2_adjustment =
+      vandq_u8(v_level2_mask, v_delta_level_1_and_2);
+  const uint8x16_t v_level3_adjustment =
+      vandq_u8(v_level3_mask, v_delta_level_2_and_3);
+  const uint8x16_t v_level1and2_adjustment =
+      vaddq_u8(v_level1_adjustment, v_level2_adjustment);
+  const uint8x16_t v_level1and2and3_adjustment =
+      vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);
+
+  /* Figure adjustment absolute value by selecting between the absolute
+   * difference if in level0 or the value for level 1, 2 and 3.
+   */
+  const uint8x16_t v_abs_adjustment =
+      vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);
+
+  /* Calculate positive and negative adjustments. Apply them to the signal
+   * and accumulate them. Adjustments are less than eight and the maximum
+   * sum of them (7 * 16) can fit in a signed char.
+   */
+  const uint8x16_t v_pos_adjustment =
+      vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+  const uint8x16_t v_neg_adjustment =
+      vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+  uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+  v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+  /* Store results. */
+  vst1q_u8(running_avg_y, v_running_avg_y);
+
+  /* Sum all the accumulators to have the sum of all pixel differences
+   * for this macroblock.
+   */
+  {
+    const int8x16_t v_sum_diff =
+        vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+                  vreinterpretq_s8_u8(v_neg_adjustment));
+    v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+  }
+  return v_sum_diff_total;
+}
+
+static INLINE int8x16_t denoiser_adjust_16x1_neon(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
+  uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+  const uint8x16_t v_sig = vld1q_u8(sig);
+  const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+  /* Calculate absolute difference and sign masks. */
+  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+  // Clamp absolute difference to delta to get the adjustment.
+  const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta));
+
+  const uint8x16_t v_pos_adjustment =
+      vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+  const uint8x16_t v_neg_adjustment =
+      vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+  v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+  v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+  /* Store results. */
+  vst1q_u8(running_avg_y, v_running_avg_y);
+
+  {
+    const int8x16_t v_sum_diff =
+        vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+                  vreinterpretq_s8_u8(v_pos_adjustment));
+    v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+  }
+  return v_sum_diff_total;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
+                                 const uint8_t *mc_running_avg_y,
+                                 int mc_avg_y_stride, uint8_t *running_avg_y,
+                                 int avg_y_stride, int increase_denoising,
+                                 BLOCK_SIZE bs, int motion_magnitude,
+                                 int width) {
+  int sum_diff_thresh, r, sum_diff = 0;
+  const int shift_inc =
+      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+          ? 1
+          : 0;
+  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+
+  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+  const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
+  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
+
+  int8x16_t v_sum_diff_total = vdupq_n_s8(0);
+
+  for (r = 0; r < b_height; ++r) {
+    memcpy(sig_buffer[r], sig, width);
+    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+           width);
+    memcpy(running_buffer[r], running_avg_y, width);
+    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+    v_sum_diff_total = denoiser_16x1_neon(
+        sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+        v_level1_threshold, v_level2_threshold, v_level3_threshold,
+        v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
+        v_sum_diff_total);
+    {
+      const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+      const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
+      const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
+      vst1_u8(running_avg_y, v_running_buffer_low);
+      vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+    }
+    // Update pointers for next iteration.
+    sig += (sig_stride << 1);
+    mc_running_avg_y += (mc_avg_y_stride << 1);
+    running_avg_y += (avg_y_stride << 1);
+  }
+
+  {
+    sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, that would otherwise not be denoised at all. Simplest
+      // is to apply an additional adjustment to running_avg_y to bring it
+      // closer to sig. The adjustment is capped by a maximum delta, and
+      // chosen such that in most cases the resulting sum_diff will be
+      // within the acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over the
+      // threshold.
+      const int delta =
+          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const uint8x16_t k_delta = vmovq_n_u8(delta);
+        running_avg_y -= avg_y_stride * (b_height << 1);
+        for (r = 0; r < b_height; ++r) {
+          v_sum_diff_total = denoiser_adjust_16x1_neon(
+              sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
+              v_sum_diff_total);
+          {
+            const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+            const uint8x8_t v_running_buffer_high =
+                vget_high_u8(v_running_buffer);
+            const uint8x8_t v_running_buffer_low =
+                vget_low_u8(v_running_buffer);
+            vst1_u8(running_avg_y, v_running_buffer_low);
+            vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+          }
+          // Update pointers for next iteration.
+          running_avg_y += (avg_y_stride << 1);
+        }
+        sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+
+  return FILTER_BLOCK;
+}
+
+// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
+static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
+                                 const uint8_t *mc_running_avg_y,
+                                 int mc_avg_y_stride, uint8_t *running_avg_y,
+                                 int avg_y_stride, int increase_denoising,
+                                 BLOCK_SIZE bs, int motion_magnitude) {
+  const int shift_inc =
+      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+          ? 1
+          : 0;
+  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+  const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+  const int b_width = (4 << b_width_log2_lookup[bs]);
+  const int b_height = (4 << b_height_log2_lookup[bs]);
+  const int b_width_shift4 = b_width >> 4;
+
+  int8x16_t v_sum_diff_total[4][4];
+  int r, c, sum_diff = 0;
+
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      v_sum_diff_total[c][r] = vdupq_n_s8(0);
+    }
+  }
+
+  for (r = 0; r < b_height; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
+          sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
+          v_level2_threshold, v_level3_threshold, v_level1_adjustment,
+          v_delta_level_1_and_2, v_delta_level_2_and_3,
+          v_sum_diff_total[c][r >> 4]);
+
+      // Update pointers for next iteration.
+      sig += 16;
+      mc_running_avg_y += 16;
+      running_avg_y += 16;
+    }
+
+    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < b_width_shift4; ++c) {
+        sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+      }
+    }
+
+    // Update pointers for next iteration.
+    sig = sig - b_width + sig_stride;
+    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+    running_avg_y = running_avg_y - b_width + avg_y_stride;
+  }
+
+  {
+    const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      const int delta =
+          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const uint8x16_t k_delta = vdupq_n_u8(delta);
+        sig -= sig_stride * b_height;
+        mc_running_avg_y -= mc_avg_y_stride * b_height;
+        running_avg_y -= avg_y_stride * b_height;
+        sum_diff = 0;
+
+        for (r = 0; r < b_height; ++r) {
+          for (c = 0; c < b_width_shift4; ++c) {
+            v_sum_diff_total[c][r >> 4] =
+                denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
+                                          k_delta, v_sum_diff_total[c][r >> 4]);
+
+            // Update pointers for next iteration.
+            sig += 16;
+            mc_running_avg_y += 16;
+            running_avg_y += 16;
+          }
+          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < b_width_shift4; ++c) {
+              sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+            }
+          }
+
+          sig = sig - b_width + sig_stride;
+          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+          running_avg_y = running_avg_y - b_width + avg_y_stride;
+        }
+
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg, int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising, BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  // Rank by frequency of the block type to have an early termination.
+  if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+      bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+      bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+    return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+                                 avg_stride, increase_denoising, bs,
+                                 motion_magnitude);
+  } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+                                 avg_stride, increase_denoising, bs,
+                                 motion_magnitude, 8);
+  }
+  return COPY_BLOCK;
+}
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h

index bdd6662863e399fe1e0c55f6a9c8698df30b2bc2..ab488f48f0aaa4432873172ebac585fed9a1b9b3 100644 (file)
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -184,7 +184,7 @@ struct macroblock {
    void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
    void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
  #if CONFIG_VP9_HIGHBITDEPTH
-  void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+  void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride,
                            int eob, int bd);
  #endif
  };
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c

index 16eea8fa2c562e99b526792330516d0bb388cf94..7e30499c5732d976d4dca2820d3b98ac2bd78cdb 100644 (file)
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -637,24 +637,25 @@ static void encode_block(int plane, int block, int row, int col,
    if (x->skip_encode || p->eobs[block] == 0) return;
  #if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
      switch (tx_size) {
        case TX_32X32:
-        vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                                   xd->bd);
          break;
        case TX_16X16:
-        vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                                   xd->bd);
          break;
        case TX_8X8:
-        vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                                 xd->bd);
          break;
        case TX_4X4:
          // this is like vp9_short_idct4x4 but has a special case around eob<=1
          // which is significant (not just an optimization) for the lossless
          // case.
-        x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        x->highbd_itxm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                             xd->bd);
          break;
        default: assert(0 && "Invalid transform size");
@@ -699,7 +700,8 @@ static void encode_block_pass1(int plane, int block, int row, int col,
    if (p->eobs[block] > 0) {
  #if CONFIG_VP9_HIGHBITDEPTH
      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
+      x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride,
+                         p->eobs[block], xd->bd);
        return;
      }
  #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -799,6 +801,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
  
  #if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
      switch (tx_size) {
        case TX_32X32:
          if (!x->skip_recode) {
@@ -814,7 +817,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
            *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
          }
          if (!x->skip_encode && *eob) {
-          vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+          vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
          }
          break;
        case TX_16X16:
@@ -834,7 +837,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
            *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
          }
          if (!x->skip_encode && *eob) {
-          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob,
                                    xd->bd);
          }
          break;
@@ -855,7 +858,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
            *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
          }
          if (!x->skip_encode && *eob) {
-          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob,
                                  xd->bd);
          }
          break;
@@ -880,9 +883,10 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
              // this is like vp9_short_idct4x4 but has a special case around
              // eob<=1 which is significant (not just an optimization) for the
              // lossless case.
-            x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+            x->highbd_itxm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
            } else {
-            vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);
+            vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type,
+                                     xd->bd);
            }
          }
          break;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c

index 62d88747156cd06ef81ba1dde66e12f4f5b89b09..32878dcc8808f1b5e227eaf76ba4d1741e1603e9 100644 (file)
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2391,6 +2391,7 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
  #if CONFIG_VP9_HIGHBITDEPTH
  static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                     YV12_BUFFER_CONFIG *dst, int bd,
+                                   INTERP_FILTER filter_type,
                                     int phase_scaler) {
    const int src_w = src->y_crop_width;
    const int src_h = src->y_crop_height;
@@ -2401,7 +2402,7 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
    const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
    uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
    const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
-  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+  const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
    int x, y, i;
  
    for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -2714,7 +2715,8 @@ void vp9_scale_references(VP9_COMP *cpi) {
                                         cm->byte_alignment, NULL, NULL, NULL))
              vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                 "Failed to allocate frame buffer");
-          scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, 0);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
+                                 EIGHTTAP, 0);
            cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
            alloc_frame_mvs(cm, new_fb);
          }
@@ -2737,7 +2739,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
                                         cm->byte_alignment, NULL, NULL, NULL))
              vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                 "Failed to allocate frame buffer");
-          vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, 0);
+          vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0);
            cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
            alloc_frame_mvs(cm, new_fb);
          }
@@ -3124,10 +3126,13 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
                                         uint8_t *dest) {
    VP9_COMMON *const cm = &cpi->common;
    int q = 0, bottom_index = 0, top_index = 0;  // Dummy variables.
+  const INTERP_FILTER filter_scaler =
+      (is_one_pass_cbr_svc(cpi))
+          ? cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id]
+          : EIGHTTAP;
    const int phase_scaler =
-      (is_one_pass_cbr_svc(cpi) &&
-       cpi->svc.filtertype_downsample_source[cpi->svc.spatial_layer_id])
-          ? 8
+      (is_one_pass_cbr_svc(cpi))
+          ? cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id]
            : 0;
  
    // Flag to check if its valid to compute the source sad (used for
@@ -3148,10 +3153,11 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
      // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take
      // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2
      // result will be saved in scaled_temp and might be used later.
-    int phase_scaler2 = (cpi->svc.filtertype_downsample_source[1]) ? 8 : 0;
+    const INTERP_FILTER filter_scaler2 = cpi->svc.downsample_filter_type[1];
+    const int phase_scaler2 = cpi->svc.downsample_filter_phase[1];
      cpi->Source = vp9_svc_twostage_scale(
          cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp,
-        phase_scaler, phase_scaler2);
+        filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
      cpi->svc.scaled_one_half = 1;
    } else if (is_one_pass_cbr_svc(cpi) &&
               cpi->un_scaled_source->y_width == cm->width << 1 &&
@@ -3162,9 +3168,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
      cpi->Source = &cpi->svc.scaled_temp;
      cpi->svc.scaled_one_half = 0;
    } else {
-    cpi->Source =
-        vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source,
-                              (cpi->oxcf.pass == 0), phase_scaler);
+    cpi->Source = vp9_scale_if_required(
+        cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0),
+        filter_scaler, phase_scaler);
    }
    // Unfiltered raw source used in metrics calculation if the source
    // has been filtered.
@@ -3173,7 +3179,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
      if (is_spatial_denoise_enabled(cpi)) {
        cpi->raw_source_frame = vp9_scale_if_required(
            cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
-          (cpi->oxcf.pass == 0), phase_scaler);
+          (cpi->oxcf.pass == 0), EIGHTTAP, phase_scaler);
      } else {
        cpi->raw_source_frame = cpi->Source;
      }
@@ -3205,9 +3211,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
         cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION ||
         (cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) ||
         cpi->compute_source_sad_onepass))
-    cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source,
-                                             (cpi->oxcf.pass == 0), 0);
+    cpi->Last_Source = vp9_scale_if_required(
+        cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+        (cpi->oxcf.pass == 0), EIGHTTAP, 0);
  
    if (cpi->Last_Source == NULL ||
        cpi->Last_Source->y_width != cpi->Source->y_width ||
@@ -3392,7 +3398,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
  
      cpi->Source =
          vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source,
-                              (cpi->oxcf.pass == 0), 0);
+                              (cpi->oxcf.pass == 0), EIGHTTAP, 0);
  
      // Unfiltered raw source used in metrics calculation if the source
      // has been filtered.
@@ -3401,7 +3407,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
        if (is_spatial_denoise_enabled(cpi)) {
          cpi->raw_source_frame = vp9_scale_if_required(
              cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
-            (cpi->oxcf.pass == 0), 0);
+            (cpi->oxcf.pass == 0), EIGHTTAP, 0);
        } else {
          cpi->raw_source_frame = cpi->Source;
        }
@@ -3411,9 +3417,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
      }
  
      if (cpi->unscaled_last_source != NULL)
-      cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
-                                               &cpi->scaled_last_source,
-                                               (cpi->oxcf.pass == 0), 0);
+      cpi->Last_Source = vp9_scale_if_required(
+          cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+          (cpi->oxcf.pass == 0), EIGHTTAP, 0);
  
      if (frame_is_intra_only(cm) == 0) {
        if (loop_count > 0) {
@@ -3693,22 +3699,26 @@ static void set_ext_overrides(VP9_COMP *cpi) {
  
  YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
      VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
-    YV12_BUFFER_CONFIG *scaled_temp, int phase_scaler, int phase_scaler2) {
+    YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+    int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
    if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
        cm->mi_rows * MI_SIZE != unscaled->y_height) {
  #if CONFIG_VP9_HIGHBITDEPTH
      if (cm->bit_depth == VPX_BITS_8) {
-      vp9_scale_and_extend_frame(unscaled, scaled_temp, phase_scaler2);
-      vp9_scale_and_extend_frame(scaled_temp, scaled, phase_scaler);
+      vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+                                 phase_scaler2);
+      vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
+                                 phase_scaler);
      } else {
        scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
-                             phase_scaler2);
+                             filter_type2, phase_scaler2);
        scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
-                             phase_scaler);
+                             filter_type, phase_scaler);
      }
  #else
-    vp9_scale_and_extend_frame(unscaled, scaled_temp, phase_scaler2);
-    vp9_scale_and_extend_frame(scaled_temp, scaled, phase_scaler);
+    vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+                               phase_scaler2);
+    vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
  #endif  // CONFIG_VP9_HIGHBITDEPTH
      return scaled;
    } else {
@@ -3716,27 +3726,25 @@ YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
    }
  }
  
-YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
-                                          YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled,
-                                          int use_normative_scaler,
-                                          int phase_scaler) {
+YV12_BUFFER_CONFIG *vp9_scale_if_required(
+    VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) {
    if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
        cm->mi_rows * MI_SIZE != unscaled->y_height) {
  #if CONFIG_VP9_HIGHBITDEPTH
      if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
          unscaled->y_height <= (scaled->y_height << 1))
        if (cm->bit_depth == VPX_BITS_8)
-        vp9_scale_and_extend_frame(unscaled, scaled, phase_scaler);
+        vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler);
        else
          scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth,
-                               phase_scaler);
+                               filter_type, phase_scaler);
      else
        scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
  #else
      if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
          unscaled->y_height <= (scaled->y_height << 1))
-      vp9_scale_and_extend_frame(unscaled, scaled, phase_scaler);
+      vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler);
      else
        scale_and_extend_frame_nonnormative(unscaled, scaled);
  #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h

index 1c7f17e196d30bba8ba900918ba95c74d74fb05a..c47926168a4f527b5a025ee1320e920bdcddedf5 100644 (file)
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -836,17 +836,14 @@ void vp9_update_reference_frames(VP9_COMP *cpi);
  
  void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
  
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
-                                           YV12_BUFFER_CONFIG *unscaled,
-                                           YV12_BUFFER_CONFIG *scaled,
-                                           YV12_BUFFER_CONFIG *scaled_temp,
-                                           int phase_scaler, int phase_scaler2);
-
-YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
-                                          YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled,
-                                          int use_normative_scaler,
-                                          int phase_scaler);
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
+    VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+    int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2);
+
+YV12_BUFFER_CONFIG *vp9_scale_if_required(
+    VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler);
  
  void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
  
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c

index 55f425b6caa174c11eb3b1eabfe106bc4014a571..e398ff5e825b1e1fa17b81e5b0a43cc4a587a7be 100644 (file)
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1400,7 +1400,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
                   (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
  
      cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
-                                        &cpi->scaled_source, 0, 0);
+                                        &cpi->scaled_source, 0, EIGHTTAP, 0);
    }
  
    vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c

index 03f8fc480eef784326d3c934b32101a8a74217a1..e58628388f0d3598144ac0115392a8d2e63d5d2d 100644 (file)
--- a/vp9/encoder/vp9_frame_scale.c
+++ b/vp9/encoder/vp9_frame_scale.c
@@ -16,7 +16,8 @@
  #include "vpx_scale/yv12config.h"
  
  void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
-                                  YV12_BUFFER_CONFIG *dst, int phase_scaler) {
+                                  YV12_BUFFER_CONFIG *dst,
+                                  INTERP_FILTER filter_type, int phase_scaler) {
    const int src_w = src->y_crop_width;
    const int src_h = src->y_crop_height;
    const int dst_w = dst->y_crop_width;
@@ -26,7 +27,7 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
    const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
    uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
    const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
-  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+  const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
    int x, y, i;
  
    for (i = 0; i < MAX_MB_PLANE; ++i) {
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c

index fc2e32448e884e2459bc4f2c961bd2b3a9a8d69a..ca0873e4c436f46ed36a6e1364ce7b38fc8a5e1b 100644 (file)
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -26,25 +26,27 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
    ne->level = kLowLow;
    ne->value = 0;
    ne->count = 0;
-  ne->thresh = 100;
+  ne->thresh = 90;
    ne->last_w = 0;
    ne->last_h = 0;
    if (width * height >= 1920 * 1080) {
      ne->thresh = 200;
    } else if (width * height >= 1280 * 720) {
-    ne->thresh = 140;
+    ne->thresh = 150;
+  } else if (width * height >= 640 * 360) {
+    ne->thresh = 100;
    }
-  ne->num_frames_estimate = 20;
+  ne->num_frames_estimate = 15;
  }
  
  static int enable_noise_estimation(VP9_COMP *const cpi) {
  #if CONFIG_VP9_HIGHBITDEPTH
    if (cpi->common.use_highbitdepth) return 0;
  #endif
-// Enable noise estimation if denoising is on, but not for low resolutions.
+// Enable noise estimation if denoising is on.
  #if CONFIG_VP9_TEMPORAL_DENOISING
    if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
-      cpi->common.width >= 640 && cpi->common.height >= 360)
+      cpi->common.width >= 320 && cpi->common.height >= 180)
      return 1;
  #endif
    // Only allow noise estimate under certain encoding mode.
@@ -97,6 +99,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
  void vp9_update_noise_estimate(VP9_COMP *const cpi) {
    const VP9_COMMON *const cm = &cpi->common;
    NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
    // Estimate of noise level every frame_period frames.
    int frame_period = 8;
    int thresh_consec_zeromv = 6;
@@ -111,6 +114,13 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
    if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi))
      last_source = &cpi->denoiser.last_source;
  #endif
+  // Tune these thresholds for different resolutions.
+  if (cm->width > 640 && cm->width < 1920) {
+    thresh_consec_zeromv = 5;
+    thresh_sum_diff = 200;
+    thresh_sum_spatial = (120 * 120) << 8;
+    thresh_spatial_var = (48 * 48) << 8;
+  }
    ne->enabled = enable_noise_estimation(cpi);
    if (cpi->svc.number_spatial_layers > 1)
      frame_counter = cpi->svc.current_superframe;
@@ -127,9 +137,12 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
        ne->last_h = cm->height;
      }
      return;
-  } else if (cpi->rc.avg_frame_low_motion < 50) {
+  } else if (cm->current_video_frame > 60 &&
+             cpi->rc.avg_frame_low_motion < (low_res ? 70 : 50)) {
      // Force noise estimation to 0 and denoiser off if content has high motion.
      ne->level = kLowLow;
+    ne->count = 0;
+    ne->num_frames_estimate = 10;
  #if CONFIG_VP9_TEMPORAL_DENOISING
      if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
          cpi->svc.current_superframe > 1) {
@@ -210,7 +223,8 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
                // Avoid blocks with high brightness and high spatial variance.
                if ((sse2 - spatial_variance) < thresh_sum_spatial &&
                    spatial_variance < thresh_spatial_var) {
-                avg_est += variance / ((spatial_variance >> 9) + 1);
+                avg_est += low_res ? variance >> 4
+                                   : variance / ((spatial_variance >> 9) + 1);
                  num_samples++;
                }
              }
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c

index d4aa46e835c6aed685a6afa5b07f71afdb4f3765..bf0fec3d8d81effc35386fe73801673a08383c5f 100644 (file)
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -601,26 +601,26 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
          vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
                                   32, NULL, 0, NULL, 0, bs, bs, xd->bd);
-        recon = CONVERT_TO_BYTEPTR(recon16);
          if (xd->lossless) {
-          vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
+          vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
          } else {
            switch (tx_size) {
              case TX_4X4:
-              vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
+              vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
                break;
              case TX_8X8:
-              vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd);
+              vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd);
                break;
              case TX_16X16:
-              vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd);
+              vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd);
                break;
              case TX_32X32:
-              vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd);
+              vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd);
                break;
              default: assert(0 && "Invalid transform size");
            }
          }
+        recon = CONVERT_TO_BYTEPTR(recon16);
        } else {
  #endif  // CONFIG_VP9_HIGHBITDEPTH
          vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);
@@ -1004,6 +1004,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
            const int block = (row + idy) * 2 + (col + idx);
            const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
            uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+          uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
            int16_t *const src_diff =
                vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
            tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
@@ -1025,7 +1026,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
              tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);
              if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
                goto next_highbd;
-            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst,
+            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16,
                                     dst_stride, p->eobs[block], xd->bd);
            } else {
              int64_t unused;
@@ -1048,7 +1049,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
              if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
                goto next_highbd;
              vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
-                                  dst, dst_stride, p->eobs[block], xd->bd);
+                                  dst16, dst_stride, p->eobs[block], xd->bd);
            }
          }
        }
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c

index ed628758b73b2fc43fbac3dd9de79356d1774cc6..5867a6c38b8be2150a62eb851091bea88648d11c 100644 (file)
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -38,11 +38,12 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
    svc->current_superframe = 0;
    for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1;
    for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
-    cpi->svc.ext_frame_flags[sl] = 0;
-    cpi->svc.ext_lst_fb_idx[sl] = 0;
-    cpi->svc.ext_gld_fb_idx[sl] = 1;
-    cpi->svc.ext_alt_fb_idx[sl] = 2;
-    cpi->svc.filtertype_downsample_source[sl] = 0;
+    svc->ext_frame_flags[sl] = 0;
+    svc->ext_lst_fb_idx[sl] = 0;
+    svc->ext_gld_fb_idx[sl] = 1;
+    svc->ext_alt_fb_idx[sl] = 2;
+    svc->downsample_filter_type[sl] = EIGHTTAP;
+    svc->downsample_filter_phase[sl] = 0;  // Set to 8 for averaging filter.
    }
  
    if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
@@ -651,11 +652,17 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
                         lc->scaling_factor_num, lc->scaling_factor_den, &width,
                         &height);
  
+  // For low resolutions: set phase of the filter = 8 (for symmetric averaging
+  // filter), use bilinear for now.
+  if (width <= 320 && height <= 240) {
+    cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR;
+    cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8;
+  }
+
    // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use
    // of base motion vectors if spatial scale factors for any layers are not 2,
    // keep the case of 3 spatial layers with scale factor of 4x4 for base layer.
    // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2.
-  // Same condition applies to use of non-zero phase_scaler.
    if (cpi->svc.number_spatial_layers > 1) {
      int sl;
      for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) {
@@ -664,10 +671,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
        if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) &&
            !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 &&
              cpi->svc.number_spatial_layers == 3)) {
-        int sl2;
          cpi->svc.use_base_mv = 0;
-        for (sl2 = 0; sl2 < cpi->svc.number_spatial_layers - 1; ++sl2)
-          cpi->svc.filtertype_downsample_source[sl2] = 0;
          break;
        }
      }
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h

index 7442e6ff749a0a721488262b605ff84be9c04300..d8e6772b26f1ca3d170af67ac0745ea954feeb27 100644 (file)
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -89,11 +89,12 @@ typedef struct {
    int current_superframe;
    int use_base_mv;
    // Used to control the downscaling filter for source scaling, for 1 pass CBR.
-  // 0 will do sub-sampling (no weighted average), 1 will center the target
-  // pixel and use the averaging filter, for the default eightap_regular:
-  // {-1, 6, -19, 78, 78, -19, 6, -1 }.
-  // TODO(marpan): Add option for bilinear.
-  int filtertype_downsample_source[VPX_SS_MAX_LAYERS];
+  // downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
+  // = 8 will center the target pixel and get a symmetric averaging filter.
+  // downsample_filter_type: 4 filters may be used: eighttap_regular,
+  // eighttap_smooth, eighttap_sharp, and bilinear.
+  INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS];
+  int downsample_filter_phase[VPX_SS_MAX_LAYERS];
  } SVC;
  
  struct VP9_COMP;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c

index 31bcb246a61a59396033b450f6657d1d66e4848c..63079415617d5f4d4f359c019d93d5bd7a0346a7 100644 (file)
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -755,7 +755,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
                                 "Failed to reallocate alt_ref_buffer");
            }
            frames[frame] = vp9_scale_if_required(
-              cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0, 0);
+              cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0,
+              EIGHTTAP, 0);
            ++frame_used;
          }
        }
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c

index 31e17f21f785115873221675d7e293a8af40badd..b53714a0289ff9ddad8af72eaf32a97d2a5ce5f9 100644 (file)
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -17,7 +17,7 @@
  
  extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
                                           YV12_BUFFER_CONFIG *dst,
-                                         int phase_scaler);
+                                         uint8_t filter_type, int phase_scaler);
  
  static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride, int w,
@@ -170,7 +170,7 @@ static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride,
  
  void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
                                        YV12_BUFFER_CONFIG *dst,
-                                      int phase_scaler) {
+                                      uint8_t filter_type, int phase_scaler) {
    const int src_w = src->y_crop_width;
    const int src_h = src->y_crop_height;
    const int dst_w = dst->y_crop_width;
@@ -198,9 +198,9 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
                              dst->uv_stride, dst_uv_w, dst_uv_h);
        vpx_extend_frame_borders(dst);
      } else {
-      vp9_scale_and_extend_frame_c(src, dst, phase_scaler);
+      vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
      }
    } else {
-    vp9_scale_and_extend_frame_c(src, dst, phase_scaler);
+    vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
    }
  }
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk

index 4b8ec5048487e6aaac9b3b3a889045a16b73b43e..47846c9410da1f92773544110d3c0e2496759d39 100644 (file)
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -121,6 +121,7 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
  
  ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
  VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_denoiser_neon.c
  endif
  
  VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c

index 96f6de1be95c4cdc9e674c74d6be68e43094127d..b26920504f58ac86f845c7300e16f56337450ebf 100644 (file)
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -125,6 +125,8 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
        out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
      }
      // transpose 8x8
+    // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
+    // columns.
      {
        // 00 01 02 03 40 41 42 43
        // 10 11 12 13 50 51 52 53
diff --git a/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/vpx_dsp/arm/highbd_idct16x16_add_neon.c

index 1259bb3807b36848b62393f265241528b8555f1b..98e42cd25ab2094e34d09d98406817a6a063ef05 100644 (file)
--- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -1268,10 +1268,8 @@ void vpx_highbd_idct16x16_10_add_half1d_pass2(const int32_t *input,
    }
  }
  
-void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
                                         int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
    if (bd == 8) {
      int16_t row_idct_output[16 * 16];
  
@@ -1313,10 +1311,8 @@ void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest,
                                        int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
    if (bd == 8) {
      int16_t row_idct_output[16 * 16];
  
@@ -1349,10 +1345,8 @@ void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest,
                                        int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
    if (bd == 8) {
      int16_t row_idct_output[4 * 16];
  
@@ -1414,7 +1408,7 @@ static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
    *dest += stride;
  }
  
-void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
    const tran_low_t out0 =
        HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -1422,7 +1416,6 @@ void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
        HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
    const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
    const int16x8_t dc = vdupq_n_s16(a1);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    int i;
  
    if (a1 >= 0) {
diff --git a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c

index 858342830d8f9d0df5711e6cd451b9bdc33cacdf..96a55c472f67110337624f2242c821276a0f9f39 100644 (file)
--- a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
@@ -386,15 +386,14 @@ static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out,
  }
  
  static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
-                                             uint8_t *const dest,
-                                             const int stride, const int bd) {
+                                             uint16_t *dst, const int stride,
+                                             const int bd) {
    int i, idct32_pass_loop;
    int32_t trans_buf[32 * 8];
    int32_t pass1[32 * 32];
    int32_t pass2[32 * 32];
    int32_t *out;
    int32x4x2_t q[16];
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
  
    for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
         idct32_pass_loop++, input = pass1, out = pass2) {
@@ -637,10 +636,10 @@ static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
    }
  }
  
-void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest,
                                          int stride, int bd) {
    if (bd == 8) {
-    vpx_idct32_32_neon(input, dest, stride, 1);
+    vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1);
    } else {
      vpx_highbd_idct32_32_neon(input, dest, stride, bd);
    }
diff --git a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c

index 52f3d43e5c49a9f00890cd047a0987c754522fd7..3970a5a86131b12521b21601d9461a25e141344a 100644 (file)
--- a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
@@ -726,10 +726,9 @@ static void vpx_highbd_idct32_16_neon(const int32_t *const input,
    highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
  }
  
-void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest,
                                         int stride, int bd) {
    int i;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    if (bd == 8) {
      int16_t temp[32 * 16];
diff --git a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c

index 195dcc92d5e097da90b7e959e159e1f1b1b36cf8..5d9063b15dc71be94c729966531ef2beb41fc233 100644 (file)
--- a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
@@ -594,10 +594,9 @@ static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output,
    highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
  }
  
-void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest,
                                        int stride, int bd) {
    int i;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    if (bd == 8) {
      int16_t temp[32 * 8];
diff --git a/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_add_neon.c

index d74331f8031648fa59e0af8ad019baca1016b0b0..63eb49678cca33fe637e66d522d5752c5c618b8e 100644 (file)
--- a/vpx_dsp/arm/highbd_idct32x32_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -59,7 +59,7 @@ static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
    *dest += stride;
  }
  
-void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
    const tran_low_t out0 =
        HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -67,7 +67,6 @@ void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
        HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
    const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
    const int16x8_t dc = vdupq_n_s16(a1);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    int i;
  
    if (a1 >= 0) {
diff --git a/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/vpx_dsp/arm/highbd_idct4x4_add_neon.c

index 128f72b9c96d7a4a791bd399feea4c8581d28fd1..20b09f6834383e418a10649aea5f20cbf55fb033 100644 (file)
--- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -51,7 +51,7 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
    *dest += stride;
  }
  
-void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
    const tran_low_t out0 =
@@ -60,7 +60,6 @@ void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,
        HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
    const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
    const int16x8_t dc = vdupq_n_s16(a1);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
    highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
@@ -133,14 +132,13 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
    *a3 = vsubq_s32(b0, b3);
  }
  
-void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
    int32x4_t c0 = vld1q_s32(input);
    int32x4_t c1 = vld1q_s32(input + 4);
    int32x4_t c2 = vld1q_s32(input + 8);
    int32x4_t c3 = vld1q_s32(input + 12);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    int16x8_t a0, a1;
  
    if (bd == 8) {
diff --git a/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/vpx_dsp/arm/highbd_idct8x8_add_neon.c

index f53f4c7fcad0fa2ce018de71e8acd5152f9bec43..6687e764959b797a2bb81fe3edba4700bb98d97f 100644 (file)
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -36,7 +36,7 @@ static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
    *dest += stride;
  }
  
-void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
    const tran_low_t out0 =
        HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -44,7 +44,6 @@ void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
        HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
    const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
    const int16x8_t dc = vdupq_n_s16(a1);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    if (a1 >= 0) {
      const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
@@ -292,9 +291,8 @@ static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2,
    vst1q_u16(dest, d7_u16);
  }
  
-void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    int32x4_t a0 = vld1q_s32(input);
    int32x4_t a1 = vld1q_s32(input + 8);
    int32x4_t a2 = vld1q_s32(input + 16);
@@ -553,9 +551,8 @@ static INLINE void idct8x8_64_half1d_bd12(
    *io7 = vsubq_s32(step1[0], step2[7]);
  }
  
-void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    int32x4_t a0 = vld1q_s32(input);
    int32x4_t a1 = vld1q_s32(input + 4);
    int32x4_t a2 = vld1q_s32(input + 8);
diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c

index 34b5baf7236cb653ae87ab28f5deeb97f697df8d..91418c9e69139b8d1e5e1cba2bef1f52b90d6370 100644 (file)
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -517,7 +517,7 @@ void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
    const int16_t *input_pass2 = pass1;  // input of pass2 is the result of pass1
    int16_t *out;
    int16x8_t q[16];
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
+  uint16_t *dst = CAST_TO_SHORTPTR(dest);
  
    for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
         idct32_pass_loop++, out = pass2) {
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c

index b6d7f86a4b27f13232a67038d742298ab71c25bb..c0828e8f63992e285d97cf01f5c2529359b8e263 100644 (file)
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -31,77 +31,129 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
    return vget_lane_s32(c, 0);
  }
  
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int w, int h, uint32_t *sse,
-                             int *sum) {
+// w * h must be less than 2048 or sum_s16 may overflow.
+// Process a block of any size where the width is divisible by 16.
+static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int w, int h, uint32_t *sse,
+                              int *sum) {
    int i, j;
-  int16x8_t v_sum = vdupq_n_s16(0);
-  int32x4_t v_sse_lo = vdupq_n_s32(0);
-  int32x4_t v_sse_hi = vdupq_n_s32(0);
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_lo_s32 = vdupq_n_s32(0);
+  int32x4_t sse_hi_s32 = vdupq_n_s32(0);
  
    for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 8) {
-      const uint8x8_t v_a = vld1_u8(&a[j]);
-      const uint8x8_t v_b = vld1_u8(&b[j]);
-      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
-      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
-      v_sum = vaddq_s16(v_sum, sv_diff);
-      v_sse_lo =
-          vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
-      v_sse_hi =
-          vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
+    for (j = 0; j < w; j += 16) {
+      const uint8x16_t a_u8 = vld1q_u8(a + j);
+      const uint8x16_t b_u8 = vld1q_u8(b + j);
+
+      const uint16x8_t diff_lo_u16 =
+          vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
+      const uint16x8_t diff_hi_u16 =
+          vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
+
+      const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
+      const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
+
+      sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
+      sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+
+      sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
+                             vget_low_s16(diff_lo_s16));
+      sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
+                             vget_high_s16(diff_lo_s16));
+
+      sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
+                             vget_low_s16(diff_hi_s16));
+      sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
+                             vget_high_s16(diff_hi_s16));
      }
      a += a_stride;
      b += b_stride;
    }
  
-  *sum = horizontal_add_s16x8(v_sum);
-  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
+}
+
+// w * h must be less than 2048 or sum_s16 may overflow.
+// Process a block of width 8 two rows at a time.
+static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int h, uint32_t *sse, int *sum) {
+  int i = 0;
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_lo_s32 = vdupq_n_s32(0);
+  int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+
+  do {
+    const uint8x8_t a_0_u8 = vld1_u8(a);
+    const uint8x8_t a_1_u8 = vld1_u8(a + a_stride);
+    const uint8x8_t b_0_u8 = vld1_u8(b);
+    const uint8x8_t b_1_u8 = vld1_u8(b + b_stride);
+    const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8);
+    const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8);
+    const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16);
+    const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16);
+    sum_s16 = vaddq_s16(sum_s16, diff_0_s16);
+    sum_s16 = vaddq_s16(sum_s16, diff_1_s16);
+    sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16),
+                           vget_low_s16(diff_0_s16));
+    sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16),
+                           vget_low_s16(diff_1_s16));
+    sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16),
+                           vget_high_s16(diff_0_s16));
+    sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16),
+                           vget_high_s16(diff_1_s16));
+    a += a_stride + a_stride;
+    b += b_stride + b_stride;
+    i += 2;
+  } while (i < h);
+
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
  }
  
  void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, unsigned int *sse, int *sum) {
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
+  variance_neon_w8x2(a, a_stride, b, b_stride, 8, sse, sum);
  }
  
  void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride, unsigned int *sse, int *sum) {
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
-}
-
-unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
-                                  const uint8_t *b, int b_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - ((sum * sum) >> 6);
+  variance_neon_w16(a, a_stride, b, b_stride, 16, 16, sse, sum);
  }
  
-unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
-  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
-}
+#define varianceNxM(n, m, shift)                                            \
+  unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \
+                                            const uint8_t *b, int b_stride, \
+                                            unsigned int *sse) {            \
+    int sum;                                                                \
+    if (n == 8)                                                             \
+      variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum);           \
+    else                                                                    \
+      variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum);         \
+    if (n * m < 16 * 16)                                                    \
+      return *sse - ((sum * sum) >> shift);                                 \
+    else                                                                    \
+      return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);              \
+  }
  
-unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
+varianceNxM(8, 4, 5);
+varianceNxM(8, 8, 6);
+varianceNxM(8, 16, 7);
+varianceNxM(16, 8, 7);
+varianceNxM(16, 16, 8);
+varianceNxM(16, 32, 9);
+varianceNxM(32, 16, 9);
+varianceNxM(32, 32, 10);
  
  unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
                                      const uint8_t *b, int b_stride,
                                      unsigned int *sse) {
    int sum1, sum2;
    uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
-  variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
-                   32, 32, &sse2, &sum2);
+  variance_neon_w16(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+  variance_neon_w16(a + (32 * a_stride), a_stride, b + (32 * b_stride),
+                    b_stride, 32, 32, &sse2, &sum2);
    *sse = sse1 + sse2;
    sum1 += sum2;
    return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
@@ -112,9 +164,9 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
                                      unsigned int *sse) {
    int sum1, sum2;
    uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
+  variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+  variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride),
+                    b_stride, 64, 16, &sse2, &sum2);
    *sse = sse1 + sse2;
    sum1 += sum2;
    return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
@@ -126,162 +178,24 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
    int sum1, sum2;
    uint32_t sse1, sse2;
  
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
+  variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+  variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride),
+                    b_stride, 64, 16, &sse2, &sum2);
    sse1 += sse2;
    sum1 += sum2;
  
-  variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
-                   b_stride, 64, 16, &sse2, &sum2);
+  variance_neon_w16(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
+                    b_stride, 64, 16, &sse2, &sum2);
    sse1 += sse2;
    sum1 += sum2;
  
-  variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
-                   b_stride, 64, 16, &sse2, &sum2);
+  variance_neon_w16(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
+                    b_stride, 64, 16, &sse2, &sum2);
    *sse = sse1 + sse2;
    sum1 += sum2;
    return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
  }
  
-unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride, unsigned int *sse) {
-  int i;
-  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-  uint32x2_t d0u32, d10u32;
-  int64x1_t d0s64, d1s64;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int32x4_t q8s32, q9s32, q10s32;
-  int64x2_t q0s64, q1s64, q5s64;
-
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 4; i++) {
-    q0u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q1u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    __builtin_prefetch(src_ptr);
-
-    q2u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    q3u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    __builtin_prefetch(ref_ptr);
-
-    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-  }
-
-  q10s32 = vaddq_s32(q10s32, q9s32);
-  q0s64 = vpaddlq_s32(q8s32);
-  q1s64 = vpaddlq_s32(q10s32);
-
-  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-  return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride, unsigned int *sse) {
-  int i;
-  uint8x8_t d0u8, d2u8, d4u8, d6u8;
-  int16x4_t d22s16, d23s16, d24s16, d25s16;
-  uint32x2_t d0u32, d10u32;
-  int64x1_t d0s64, d1s64;
-  uint16x8_t q11u16, q12u16;
-  int32x4_t q8s32, q9s32, q10s32;
-  int64x2_t q0s64, q1s64, q5s64;
-
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 8; i++) {
-    d0u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d2u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    __builtin_prefetch(src_ptr);
-
-    d4u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    d6u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    __builtin_prefetch(ref_ptr);
-
-    q11u16 = vsubl_u8(d0u8, d4u8);
-    q12u16 = vsubl_u8(d2u8, d6u8);
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-  }
-
-  q10s32 = vaddq_s32(q10s32, q9s32);
-  q0s64 = vpaddlq_s32(q8s32);
-  q1s64 = vpaddlq_s32(q10s32);
-
-  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-  return vget_lane_u32(d0u32, 0);
-}
-
  unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
                                 const unsigned char *ref_ptr, int recon_stride,
                                 unsigned int *sse) {
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c

index 210a9bed962331d601b48fd12a78d8c252f2478d..29323d1b899800cf8e02eec1591ba0fceaff39f3 100644 (file)
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -1182,16 +1182,10 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
  
    // Rows
    for (i = 0; i < 32; ++i) {
-    int16_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
+    int16_t zero_coeff = 0;
+    for (j = 0; j < 32; ++j) zero_coeff |= input[j];
+
+    if (zero_coeff)
        idct32_c(input, outptr);
      else
        memset(outptr, 0, sizeof(tran_low_t) * 32);
@@ -1290,7 +1284,7 @@ static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
    return 0;
  }
  
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
    /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
       0.5 shifts per pixel. */
@@ -1299,7 +1293,6 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
    tran_high_t a1, b1, c1, d1, e1;
    const tran_low_t *ip = input;
    tran_low_t *op = output;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    for (i = 0; i < 4; i++) {
      a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1348,14 +1341,13 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
                                  int stride, int bd) {
    int i;
    tran_high_t a1, e1;
    tran_low_t tmp[4];
    const tran_low_t *ip = in;
    tran_low_t *op = tmp;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    (void)bd;
  
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1452,13 +1444,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
    output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
  }
  
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
    int i, j;
    tran_low_t out[4 * 4];
    tran_low_t *outptr = out;
    tran_low_t temp_in[4], temp_out[4];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    // Rows
    for (i = 0; i < 4; ++i) {
@@ -1478,13 +1469,12 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int bd) {
    int i;
    tran_high_t a1;
    tran_low_t out =
        HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
    a1 = ROUND_POWER_OF_TWO(out, 4);
@@ -1636,13 +1626,12 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
    output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  }
  
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
    int i, j;
    tran_low_t out[8 * 8];
    tran_low_t *outptr = out;
    tran_low_t temp_in[8], temp_out[8];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    // First transform rows
    for (i = 0; i < 8; ++i) {
@@ -1662,13 +1651,12 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
    int i, j;
    tran_low_t out[8 * 8] = { 0 };
    tran_low_t *outptr = out;
    tran_low_t temp_in[8], temp_out[8];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    // First transform rows
    // Only first 4 row has non-zero coefs
@@ -1689,13 +1677,12 @@ void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int bd) {
    int i, j;
    tran_high_t a1;
    tran_low_t out =
        HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
    a1 = ROUND_POWER_OF_TWO(out, 5);
@@ -2056,13 +2043,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
    output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
  }
  
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
    int i, j;
    tran_low_t out[16 * 16];
    tran_low_t *outptr = out;
    tran_low_t temp_in[16], temp_out[16];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    // First transform rows
    for (i = 0; i < 16; ++i) {
@@ -2082,13 +2068,12 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
    int i, j;
    tran_low_t out[16 * 16] = { 0 };
    tran_low_t *outptr = out;
    tran_low_t temp_in[16], temp_out[16];
-  uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
  
    // First transform rows. Since all non-zero dct coefficients are in
    // upper-left 8x8 area, we only need to calculate first 8 rows here.
@@ -2111,13 +2096,12 @@ void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
    int i, j;
    tran_low_t out[16 * 16] = { 0 };
    tran_low_t *outptr = out;
    tran_low_t temp_in[16], temp_out[16];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    // First transform rows. Since all non-zero dct coefficients are in
    // upper-left 4x4 area, we only need to calculate first 4 rows here.
@@ -2138,13 +2122,12 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
    int i, j;
    tran_high_t a1;
    tran_low_t out =
        HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
    a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -2531,26 +2514,19 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
    output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
  }
  
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
    int i, j;
    tran_low_t out[32 * 32];
    tran_low_t *outptr = out;
    tran_low_t temp_in[32], temp_out[32];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    // Rows
    for (i = 0; i < 32; ++i) {
-    tran_low_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
+    tran_low_t zero_coeff = 0;
+    for (j = 0; j < 32; ++j) zero_coeff |= input[j];
+
+    if (zero_coeff)
        highbd_idct32_c(input, outptr, bd);
      else
        memset(outptr, 0, sizeof(tran_low_t) * 32);
@@ -2569,13 +2545,12 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
    int i, j;
    tran_low_t out[32 * 32] = { 0 };
    tran_low_t *outptr = out;
    tran_low_t temp_in[32], temp_out[32];
-  uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
  
    // Rows
    // Only upper-left 16x16 has non-zero coeff
@@ -2598,13 +2573,12 @@ void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
    int i, j;
    tran_low_t out[32 * 32] = { 0 };
    tran_low_t *outptr = out;
    tran_low_t temp_in[32], temp_out[32];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  
    // Rows
    // Only upper-left 8x8 has non-zero coeff
@@ -2625,11 +2599,10 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
    }
  }
  
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
    int i, j;
    int a1;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
    tran_low_t out =
        HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk

index d1c991782b66f58b2abc91bc19e23e56b914ced1..6daa583901b85532216a4e692b7799083280e585 100644 (file)
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -231,6 +231,11 @@ DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_add_neon.c
  DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_34_add_neon.c
  DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_135_add_neon.c
  DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_1024_add_neon.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_inv_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct4x4_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct8x8_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct16x16_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct32x32_add_sse2.c
  endif  # !CONFIG_VP9_HIGHBITDEPTH
  
  ifeq ($(HAVE_NEON_ASM),yes)
@@ -351,6 +356,9 @@ DSP_SRCS-$(HAVE_VSX)  += ppc/types_vsx.h
  DSP_SRCS-$(HAVE_VSX)  += ppc/transpose_vsx.h
  DSP_SRCS-$(HAVE_VSX)  += ppc/bitdepth_conversion_vsx.h
  
+# X86 utilities
+DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
+
  DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
  
  DSP_SRCS-yes += vpx_dsp_rtcd.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index d025a2f5bf0bf290d9a7cfe83e41bf090c61c7ad..3126ae6c8d3188c849cc4673342458f3f223331e 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -629,39 +629,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
    specialize qw/vpx_iwht4x4_16_add sse2/;
  
-  add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
-  add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    specialize qw/vpx_highbd_idct4x4_1_add neon/;
  
-  add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
-  add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
-  add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    specialize qw/vpx_highbd_idct8x8_1_add neon/;
  
-  add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
-  add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
-  add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
-  add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    specialize qw/vpx_highbd_idct16x16_1_add neon/;
  
-  add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
-  add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
-  add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
-  add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;
  
-  add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
-  add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
  
    if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
      specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;
@@ -1177,10 +1177,10 @@ add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int sourc
    specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x16 sse2 avx2 msa/;
+  specialize qw/vpx_variance32x16 sse2 avx2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x32 sse2 msa/;
+  specialize qw/vpx_variance16x32 sse2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
    specialize qw/vpx_variance16x16 sse2 avx2 neon msa/;
@@ -1195,12 +1195,14 @@ add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_
    specialize qw/vpx_variance8x8 sse2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x4 sse2 msa/;
+  specialize qw/vpx_variance8x4 sse2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+# TODO(johannkoenig): neon
    specialize qw/vpx_variance4x8 sse2 msa/;
  
  add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+# TODO(johannkoenig): neon
    specialize qw/vpx_variance4x4 sse2 msa/;
  
  #
diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c

new file mode 100644 (file)

index 0000000..f16e4d0
--- /dev/null
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -0,0 +1,244 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                       int stride, int bd) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);
+  const __m128i max = _mm_set1_epi16(3155);
+  const __m128i min = _mm_set1_epi16(-3155);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 32; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 32; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      array_transpose_16x16(inptr, inptr + 16);
+      for (i = 0; i < 16; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 16; ++i) {
+      vpx_highbd_idct16_c(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[2];
+      for (i = 0; i < 16; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], rounding);
+        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
+        inptr[i] = _mm_srai_epi16(inptr[i], 6);
+        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+      vpx_highbd_idct16_c(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);
+  const __m128i max = _mm_set1_epi16(3155);
+  const __m128i min = _mm_set1_epi16(-3155);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // Since all non-zero dct coefficients are in upper-left 4x4 area,
+  // we only need to consider first 4 rows here.
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform (N.B. This transposes inptr)
+    idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 16; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use fact only first 4 rows contain non-zero coeffs
+      array_transpose_8x8(inptr, inptr);
+      array_transpose_8x8(inptr + 8, inptr + 16);
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vpx_highbd_idct16_c(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[2];
+      for (i = 0; i < 16; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], rounding);
+        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
+        inptr[i] = _mm_srai_epi16(inptr[i], 6);
+        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+      vpx_highbd_idct16_c(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
diff --git a/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c

new file mode 100644 (file)

index 0000000..bc9debf
--- /dev/null
+++ b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int bd) {
+  __m128i dc_value, d;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+  int a, i, j;
+  tran_low_t out;
+
+  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+  a = ROUND_POWER_OF_TWO(out, 6);
+
+  d = _mm_set1_epi32(a);
+  dc_value = _mm_packs_epi32(d, d);
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 4; ++j) {
+      d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
+      d = _mm_adds_epi16(d, dc_value);
+      d = _mm_max_epi16(d, zero);
+      d = _mm_min_epi16(d, max);
+      _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
+    }
+    dest += stride;
+  }
+}
diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c

new file mode 100644 (file)

index 0000000..f3a6979
--- /dev/null
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -0,0 +1,129 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                    int stride, int bd) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  int i, j;
+  __m128i inptr[4];
+  __m128i sign_bits[2];
+  __m128i temp_mm, min_input, max_input;
+  int test;
+  int optimised_cols = 0;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i max = _mm_set1_epi16(12043);
+  const __m128i min = _mm_set1_epi16(-12043);
+  // Load input into __m128i
+  inptr[0] = _mm_loadu_si128((const __m128i *)input);
+  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+  // Pack to 16 bits
+  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp_mm = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp_mm);
+
+  if (!test) {
+    // Do the row transform
+    idct4_sse2(inptr);
+
+    // Check the min & max values
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp_mm = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp_mm);
+
+    if (test) {
+      transpose_4x4(inptr);
+      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
+      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vpx_highbd_idct4_c(input, outptr, bd);
+      input += 4;
+      outptr += 4;
+    }
+  }
+
+  if (optimised_cols) {
+    idct4_sse2(inptr);
+
+    // Final round and shift
+    inptr[0] = _mm_add_epi16(inptr[0], eight);
+    inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+    inptr[0] = _mm_srai_epi16(inptr[0], 4);
+    inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+    // Reconstruction and Store
+    {
+      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+      d0 = _mm_unpacklo_epi64(
+          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+      d2 = _mm_unpacklo_epi64(
+          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+      // store input0
+      _mm_storel_epi64((__m128i *)dest, d0);
+      // store input1
+      d0 = _mm_srli_si128(d0, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride), d0);
+      // store input2
+      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+      // store input3
+      d2 = _mm_srli_si128(d2, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[4], temp_out[4];
+    // Columns
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
+      vpx_highbd_idct4_c(temp_in, temp_out, bd);
+      for (j = 0; j < 4; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+      }
+    }
+  }
+}
diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c

new file mode 100644 (file)

index 0000000..6a2e180
--- /dev/null
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -0,0 +1,216 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);
+  const __m128i max = _mm_set1_epi16(6201);
+  const __m128i min = _mm_set1_epi16(-6201);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 8; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      array_transpose_8x8(inptr, inptr);
+      for (i = 0; i < 8; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 8; ++i) {
+      vpx_highbd_idct8_c(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    idct8_sse2(inptr);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+      vpx_highbd_idct8_c(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);
+  const __m128i max = _mm_set1_epi16(6201);
+  const __m128i min = _mm_set1_epi16(-6201);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // only first 4 row has non-zero coefs
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use fact only first 4 rows contain non-zero coeffs
+      array_transpose_4X8(inptr, inptr);
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vpx_highbd_idct8_c(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    idct8_sse2(inptr);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+      vpx_highbd_idct8_c(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h

new file mode 100644 (file)

index 0000000..774cce1
--- /dev/null
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
+#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+  __m128i ubounded, retval;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+  ubounded = _mm_cmpgt_epi16(value, max);
+  retval = _mm_andnot_si128(ubounded, value);
+  ubounded = _mm_and_si128(ubounded, max);
+  retval = _mm_or_si128(retval, ubounded);
+  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
+  return retval;
+}
+
+#endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c

index 8c33caedbd8943d68b3b177e4987d507c6b69874..4b201b987595992ae3d3ccf192c543c7c1a74799 100644 (file)
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -10,153 +10,36 @@
  
  #include "./vpx_dsp_rtcd.h"
  #include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
  #include "vpx_dsp/x86/txfm_common_sse2.h"
  
-#define RECON_AND_STORE4X4(dest, in_x)                    \
-  {                                                       \
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                     \
-    d0 = _mm_add_epi16(in_x, d0);                         \
-    d0 = _mm_packus_epi16(d0, d0);                        \
-    *(int *)(dest) = _mm_cvtsi128_si32(d0);               \
-  }
-
  void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
-  const __m128i zero = _mm_setzero_si128();
    const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16(
-      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
-      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i input0, input1, input2, input3;
+  __m128i in[2];
  
    // Rows
-  input0 = load_input_data(input);
-  input2 = load_input_data(input + 8);
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input0, 0xd8);
-  input0 = _mm_shufflehi_epi16(input0, 0xd8);
-  input2 = _mm_shufflelo_epi16(input2, 0xd8);
-  input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
-  input1 = _mm_unpackhi_epi32(input0, input0);
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input3 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpacklo_epi32(input2, input2);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input1);
-  input1 = _mm_packs_epi32(input2, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 8);
+  idct4_sse2(in);
  
    // Columns
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_unpacklo_epi32(input2, input2);
-  input1 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpackhi_epi32(input3, input3);
-  input3 = _mm_unpacklo_epi32(input3, input3);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input2);
-  input1 = _mm_packs_epi32(input1, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
+  idct4_sse2(in);
  
    // Final round and shift
-  input2 = _mm_add_epi16(input2, eight);
-  input3 = _mm_add_epi16(input3, eight);
-
-  input2 = _mm_srai_epi16(input2, 4);
-  input3 = _mm_srai_epi16(input3, 4);
+  in[0] = _mm_add_epi16(in[0], eight);
+  in[1] = _mm_add_epi16(in[1], eight);
+  in[0] = _mm_srai_epi16(in[0], 4);
+  in[1] = _mm_srai_epi16(in[1], 4);
  
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d0 = _mm_add_epi16(d0, input2);
-    d2 = _mm_add_epi16(d2, input3);
-    d0 = _mm_packus_epi16(d0, d2);
-    // store input0
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    // store input1
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    // store input2
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-    // store input3
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-  }
+  recon_and_store4x4_sse2(in, dest, stride);
  }
  
  void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
-  __m128i dc_value;
    const __m128i zero = _mm_setzero_si128();
    int a;
+  __m128i dc_value, d[2];
  
    a = (int)dct_const_round_shift(input[0] * cospi_16_64);
    a = (int)dct_const_round_shift(a * cospi_16_64);
@@ -164,18 +47,26 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
  
    dc_value = _mm_set1_epi16(a);
  
-  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
-  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
-  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+  // Reconstruction and Store
+  d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
+  d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+  d[0] = _mm_unpacklo_epi32(d[0],
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+  d[1] = _mm_unpacklo_epi32(
+      _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
+  d[0] = _mm_unpacklo_epi8(d[0], zero);
+  d[1] = _mm_unpacklo_epi8(d[1], zero);
+  d[0] = _mm_add_epi16(d[0], dc_value);
+  d[1] = _mm_add_epi16(d[1], dc_value);
+  d[0] = _mm_packus_epi16(d[0], d[1]);
+
+  *(int *)dest = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
  }
  
  void idct4_sse2(__m128i *in) {
@@ -3349,595 +3240,3 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
      RECON_AND_STORE(dest + 24 + j * stride, dc_value);
    }
  }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
-  __m128i ubounded, retval;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  ubounded = _mm_cmpgt_epi16(value, max);
-  retval = _mm_andnot_si128(ubounded, value);
-  ubounded = _mm_and_si128(ubounded, max);
-  retval = _mm_or_si128(retval, ubounded);
-  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
-  return retval;
-}
-
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  __m128i inptr[4];
-  __m128i sign_bits[2];
-  __m128i temp_mm, min_input, max_input;
-  int test;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  int optimised_cols = 0;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i max = _mm_set1_epi16(12043);
-  const __m128i min = _mm_set1_epi16(-12043);
-  // Load input into __m128i
-  inptr[0] = _mm_loadu_si128((const __m128i *)input);
-  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
-  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
-  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
-  // Pack to 16 bits
-  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
-  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp_mm = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp_mm);
-
-  if (!test) {
-    // Do the row transform
-    idct4_sse2(inptr);
-
-    // Check the min & max values
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp_mm = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp_mm);
-
-    if (test) {
-      transpose_4x4(inptr);
-      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
-      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
-      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
-      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
-      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
-      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
-      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
-      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
-      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
-      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vpx_highbd_idct4_c(input, outptr, bd);
-      input += 4;
-      outptr += 4;
-    }
-  }
-
-  if (optimised_cols) {
-    idct4_sse2(inptr);
-
-    // Final round and shift
-    inptr[0] = _mm_add_epi16(inptr[0], eight);
-    inptr[1] = _mm_add_epi16(inptr[1], eight);
-
-    inptr[0] = _mm_srai_epi16(inptr[0], 4);
-    inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
-    // Reconstruction and Store
-    {
-      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
-      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
-      d0 = _mm_unpacklo_epi64(
-          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
-      d2 = _mm_unpacklo_epi64(
-          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
-      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
-      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
-      // store input0
-      _mm_storel_epi64((__m128i *)dest, d0);
-      // store input1
-      d0 = _mm_srli_si128(d0, 8);
-      _mm_storel_epi64((__m128i *)(dest + stride), d0);
-      // store input2
-      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
-      // store input3
-      d2 = _mm_srli_si128(d2, 8);
-      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[4], temp_out[4];
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-      vpx_highbd_idct4_c(temp_in, temp_out, bd);
-      for (j = 0; j < 4; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
-      }
-    }
-  }
-}
-
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[8];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  const __m128i max = _mm_set1_epi16(6201);
-  const __m128i min = _mm_set1_epi16(-6201);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 8; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 8; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct8_sse2(inptr);
-
-    // Find the min & max for the column transform
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 8; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      array_transpose_8x8(inptr, inptr);
-      for (i = 0; i < 8; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 8; ++i) {
-      vpx_highbd_idct8_c(input, outptr, bd);
-      input += 8;
-      outptr += 8;
-    }
-  }
-
-  if (optimised_cols) {
-    idct8_sse2(inptr);
-
-    // Final round & shift and Reconstruction and Store
-    {
-      __m128i d[8];
-      for (i = 0; i < 8; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[8], temp_out[8];
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-      vpx_highbd_idct8_c(temp_in, temp_out, bd);
-      for (j = 0; j < 8; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-      }
-    }
-  }
-}
-
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[8];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  const __m128i max = _mm_set1_epi16(6201);
-  const __m128i min = _mm_set1_epi16(-6201);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 8; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  // only first 4 row has non-zero coefs
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 4; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct8_sse2(inptr);
-
-    // Find the min & max for the column transform
-    // N.B. Only first 4 cols contain non-zero coeffs
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 8; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      // Use fact only first 4 rows contain non-zero coeffs
-      array_transpose_4X8(inptr, inptr);
-      for (i = 0; i < 4; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vpx_highbd_idct8_c(input, outptr, bd);
-      input += 8;
-      outptr += 8;
-    }
-  }
-
-  if (optimised_cols) {
-    idct8_sse2(inptr);
-
-    // Final round & shift and Reconstruction and Store
-    {
-      __m128i d[8];
-      for (i = 0; i < 8; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[8], temp_out[8];
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-      vpx_highbd_idct8_c(temp_in, temp_out, bd);
-      for (j = 0; j < 8; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-      }
-    }
-  }
-}
-
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                       int stride, int bd) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[32];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_set1_epi16(32);
-  const __m128i max = _mm_set1_epi16(3155);
-  const __m128i min = _mm_set1_epi16(-3155);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 16; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 32; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct16_sse2(inptr, inptr + 16);
-
-    // Find the min & max for the column transform
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 32; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      array_transpose_16x16(inptr, inptr + 16);
-      for (i = 0; i < 16; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 16; ++i) {
-      vpx_highbd_idct16_c(input, outptr, bd);
-      input += 16;
-      outptr += 16;
-    }
-  }
-
-  if (optimised_cols) {
-    idct16_sse2(inptr, inptr + 16);
-
-    // Final round & shift and Reconstruction and Store
-    {
-      __m128i d[2];
-      for (i = 0; i < 16; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], rounding);
-        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
-        inptr[i] = _mm_srai_epi16(inptr[i], 6);
-        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
-        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[16], temp_out[16];
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-      vpx_highbd_idct16_c(temp_in, temp_out, bd);
-      for (j = 0; j < 16; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-      }
-    }
-  }
-}
-
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                      int stride, int bd) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[32];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_set1_epi16(32);
-  const __m128i max = _mm_set1_epi16(3155);
-  const __m128i min = _mm_set1_epi16(-3155);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 16; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  // Since all non-zero dct coefficients are in upper-left 4x4 area,
-  // we only need to consider first 4 rows here.
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 4; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform (N.B. This transposes inptr)
-    idct16_sse2(inptr, inptr + 16);
-
-    // Find the min & max for the column transform
-    // N.B. Only first 4 cols contain non-zero coeffs
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 16; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      // Use fact only first 4 rows contain non-zero coeffs
-      array_transpose_8x8(inptr, inptr);
-      array_transpose_8x8(inptr + 8, inptr + 16);
-      for (i = 0; i < 4; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vpx_highbd_idct16_c(input, outptr, bd);
-      input += 16;
-      outptr += 16;
-    }
-  }
-
-  if (optimised_cols) {
-    idct16_sse2(inptr, inptr + 16);
-
-    // Final round & shift and Reconstruction and Store
-    {
-      __m128i d[2];
-      for (i = 0; i < 16; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], rounding);
-        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
-        inptr[i] = _mm_srai_epi16(inptr[i], 6);
-        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
-        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[16], temp_out[16];
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-      vpx_highbd_idct16_c(temp_in, temp_out, bd);
-      for (j = 0; j < 16; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-      }
-    }
-  }
-}
-
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                     int stride, int bd) {
-  __m128i dc_value, d;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  int a, i, j;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  tran_low_t out;
-
-  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
-  a = ROUND_POWER_OF_TWO(out, 6);
-
-  d = _mm_set1_epi32(a);
-  dc_value = _mm_packs_epi32(d, d);
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 4; ++j) {
-      d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
-      d = _mm_adds_epi16(d, dc_value);
-      d = _mm_max_epi16(d, zero);
-      d = _mm_min_epi16(d, max);
-      _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
-    }
-    dest += stride;
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h

index d5683ab1cf0fd5a588c288003e8befe04c03c7ee..0460ab13bcbda57f36aa63ef9f9d8c248c02afaf 100644 (file)
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -279,6 +279,34 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
      res3 = _mm_packs_epi32(tmp6, tmp7);                                        \
    }
  
+static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
+                                           uint8_t *const dest,
+                                           const int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d[2];
+
+  // Reconstruction and Store
+  d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
+  d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+  d[0] = _mm_unpacklo_epi32(d[0],
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+  d[1] = _mm_unpacklo_epi32(
+      _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
+  d[0] = _mm_unpacklo_epi8(d[0], zero);
+  d[1] = _mm_unpacklo_epi8(d[1], zero);
+  d[0] = _mm_add_epi16(d[0], in[0]);
+  d[1] = _mm_add_epi16(d[1], in[1]);
+  d[0] = _mm_packus_epi16(d[0], d[1]);
+
+  *(int *)dest = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
+}
+
  void idct4_sse2(__m128i *in);
  void idct8_sse2(__m128i *in);
  void idct16_sse2(__m128i *in0, __m128i *in1);
diff --git a/vpx_dsp/x86/transpose_sse2.h b/vpx_dsp/x86/transpose_sse2.h

new file mode 100644 (file)

index 0000000..7292723
--- /dev/null
+++ b/vpx_dsp/x86/transpose_sse2.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_
+#define VPX_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void transpose_4x4(__m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
+#endif  // VPX_DSP_X86_TRANSPOSE_SSE2_H_
author	Johann Koenig <johannkoenig@google.com>
	Wed, 10 May 2017 18:19:50 +0000 (18:19 +0000)
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>
	Wed, 10 May 2017 18:19:52 +0000 (18:19 +0000)
test/datarate_test.cc		patch \| blob \| history
test/dct16x16_test.cc		patch \| blob \| history
test/dct32x32_test.cc		patch \| blob \| history
test/fdct4x4_test.cc		patch \| blob \| history
test/fdct8x8_test.cc		patch \| blob \| history
test/partial_idct_test.cc		patch \| blob \| history
test/variance_test.cc		patch \| blob \| history
test/vp9_denoiser_test.cc		patch \| blob \| history
vp9/common/vp9_idct.c		patch \| blob \| history
vp9/common/vp9_idct.h		patch \| blob \| history
vp9/common/vp9_rtcd_defs.pl		patch \| blob \| history
vp9/common/x86/vp9_idct_intrin_sse2.c		patch \| blob \| history
vp9/decoder/vp9_decodeframe.c		patch \| blob \| history
vp9/encoder/arm/neon/vp9_denoiser_neon.c	[new file with mode: 0644]	patch \| blob
vp9/encoder/vp9_block.h		patch \| blob \| history
vp9/encoder/vp9_encodemb.c		patch \| blob \| history
vp9/encoder/vp9_encoder.c		patch \| blob \| history
vp9/encoder/vp9_encoder.h		patch \| blob \| history
vp9/encoder/vp9_firstpass.c		patch \| blob \| history
vp9/encoder/vp9_frame_scale.c		patch \| blob \| history
vp9/encoder/vp9_noise_estimate.c		patch \| blob \| history
vp9/encoder/vp9_rdopt.c		patch \| blob \| history
vp9/encoder/vp9_svc_layercontext.c		patch \| blob \| history
vp9/encoder/vp9_svc_layercontext.h		patch \| blob \| history
vp9/encoder/vp9_temporal_filter.c		patch \| blob \| history
vp9/encoder/x86/vp9_frame_scale_ssse3.c		patch \| blob \| history
vp9/vp9cx.mk		patch \| blob \| history
vpx_dsp/arm/fwd_txfm_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_idct16x16_add_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_idct32x32_135_add_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_idct32x32_34_add_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_idct32x32_add_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_idct4x4_add_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_idct8x8_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct32x32_add_neon.c		patch \| blob \| history
vpx_dsp/arm/variance_neon.c		patch \| blob \| history
vpx_dsp/inv_txfm.c		patch \| blob \| history
vpx_dsp/vpx_dsp.mk		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history
vpx_dsp/x86/highbd_idct16x16_add_sse2.c	[new file with mode: 0644]	patch \| blob
vpx_dsp/x86/highbd_idct32x32_add_sse2.c	[new file with mode: 0644]	patch \| blob
vpx_dsp/x86/highbd_idct4x4_add_sse2.c	[new file with mode: 0644]	patch \| blob
vpx_dsp/x86/highbd_idct8x8_add_sse2.c	[new file with mode: 0644]	patch \| blob
vpx_dsp/x86/highbd_inv_txfm_sse2.h	[new file with mode: 0644]	patch \| blob
vpx_dsp/x86/inv_txfm_sse2.c		patch \| blob \| history
vpx_dsp/x86/inv_txfm_sse2.h		patch \| blob \| history
vpx_dsp/x86/transpose_sse2.h	[new file with mode: 0644]	patch \| blob