Merge "ppc: Add vpx_sadnxmx4d_vsx for n,m = {8, 16, 32 ,64}"

author James Zern <jzern@google.com>

Tue, 6 Jun 2017 23:52:39 +0000 (23:52 +0000)

committer Gerrit Code Review <noreply-gerritcodereview@google.com>

Tue, 6 Jun 2017 23:52:39 +0000 (23:52 +0000)
author James Zern <jzern@google.com>
Tue, 6 Jun 2017 23:52:39 +0000 (23:52 +0000)
committer Gerrit Code Review <noreply-gerritcodereview@google.com>
Tue, 6 Jun 2017 23:52:39 +0000 (23:52 +0000)
diff --git a/README b/README

index 63f9eeade22d6d664914fc72f9b741c8b531f5b7..f910ce76187fa64dd3dee9215effcd4ad89e4fe9 100644 (file)
--- a/README
+++ b/README
@@ -58,6 +58,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
      armv7-win32-vs11
      armv7-win32-vs12
      armv7-win32-vs14
+    armv7-win32-vs15
      armv7s-darwin-gcc
      armv8-linux-gcc
      mips32-linux-gcc
@@ -85,6 +86,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
      x86-win32-vs11
      x86-win32-vs12
      x86-win32-vs14
+    x86-win32-vs15
      x86_64-android-gcc
      x86_64-darwin9-gcc
      x86_64-darwin10-gcc
@@ -103,6 +105,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
      x86_64-win64-vs11
      x86_64-win64-vs12
      x86_64-win64-vs14
+    x86_64-win64-vs15
      generic-gnu
  
    The generic-gnu target, in conjunction with the CROSS environment variable,
diff --git a/build/make/gen_msvs_sln.sh b/build/make/gen_msvs_sln.sh

index 7d5f4681094de8ad4969bb560db25106b7074b91..8b68038b3060150e1524f7201a64ca8538146d9e 100755 (executable)
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh
@@ -25,7 +25,7 @@ files.
  Options:
      --help                      Print this message
      --out=outfile               Redirect output to a file
-    --ver=version               Version (7,8,9,10,11,12,14) of visual studio to generate for
+    --ver=version               Version (7,8,9,10,11,12,14,15) of visual studio to generate for
      --target=isa-os-cc          Target specifier
  EOF
      exit 1
@@ -215,7 +215,7 @@ for opt in "$@"; do
      ;;
      --ver=*) vs_ver="$optval"
               case $optval in
-             10|11|12|14)
+             10|11|12|14|15)
               ;;
               *) die Unrecognized Visual Studio Version in $opt
               ;;
@@ -243,6 +243,9 @@ case "${vs_ver:-10}" in
      14) sln_vers="14.00"
         sln_vers_str="Visual Studio 2015"
      ;;
+    15) sln_vers="15.00"
+       sln_vers_str="Visual Studio 2017"
+    ;;
  esac
  sfx=vcxproj
  
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh

index 2cf62c117c2601808e7eaef56c42c770043374e2..171d0b99b6e8f4df2d13469abf1c0120e42b4c19 100755 (executable)
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -34,7 +34,7 @@ Options:
      --name=project_name         Name of the project (required)
      --proj-guid=GUID            GUID to use for the project
      --module-def=filename       File containing export definitions (for DLLs)
-    --ver=version               Version (10,11,12,14) of visual studio to generate for
+    --ver=version               Version (10,11,12,14,15) of visual studio to generate for
      --src-path-bare=dir         Path to root of source tree
      -Ipath/to/include           Additional include directories
      -DFLAG[=value]              Preprocessor macros to define
@@ -168,7 +168,7 @@ for opt in "$@"; do
          --ver=*)
              vs_ver="$optval"
              case "$optval" in
-                10|11|12|14)
+                10|11|12|14|15)
                  ;;
                  *) die Unrecognized Visual Studio Version in $opt
                  ;;
@@ -218,7 +218,7 @@ guid=${guid:-`generate_uuid`}
  asm_use_custom_step=false
  uses_asm=${uses_asm:-false}
  case "${vs_ver:-11}" in
-    10|11|12|14)
+    10|11|12|14|15)
         asm_use_custom_step=$uses_asm
      ;;
  esac
@@ -347,6 +347,9 @@ generate_vcxproj() {
              if [ "$vs_ver" = "14" ]; then
                  tag_content PlatformToolset v140
              fi
+            if [ "$vs_ver" = "15" ]; then
+                tag_content PlatformToolset v141
+            fi
              tag_content CharacterSet Unicode
              if [ "$config" = "Release" ]; then
                  tag_content WholeProgramOptimization true
diff --git a/configure b/configure

index 8f4ceb047a47b8c6b33c436f66fd13672899fef6..090d3fb1edc115ab55c7115a1e0814c32f982a89 100755 (executable)
--- a/configure
+++ b/configure
@@ -109,6 +109,7 @@ all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
  all_platforms="${all_platforms} armv7-win32-vs11"
  all_platforms="${all_platforms} armv7-win32-vs12"
  all_platforms="${all_platforms} armv7-win32-vs14"
+all_platforms="${all_platforms} armv7-win32-vs15"
  all_platforms="${all_platforms} armv7s-darwin-gcc"
  all_platforms="${all_platforms} armv8-linux-gcc"
  all_platforms="${all_platforms} mips32-linux-gcc"
@@ -138,6 +139,7 @@ all_platforms="${all_platforms} x86-win32-vs10"
  all_platforms="${all_platforms} x86-win32-vs11"
  all_platforms="${all_platforms} x86-win32-vs12"
  all_platforms="${all_platforms} x86-win32-vs14"
+all_platforms="${all_platforms} x86-win32-vs15"
  all_platforms="${all_platforms} x86_64-android-gcc"
  all_platforms="${all_platforms} x86_64-darwin9-gcc"
  all_platforms="${all_platforms} x86_64-darwin10-gcc"
@@ -156,6 +158,7 @@ all_platforms="${all_platforms} x86_64-win64-vs10"
  all_platforms="${all_platforms} x86_64-win64-vs11"
  all_platforms="${all_platforms} x86_64-win64-vs12"
  all_platforms="${all_platforms} x86_64-win64-vs14"
+all_platforms="${all_platforms} x86_64-win64-vs15"
  all_platforms="${all_platforms} generic-gnu"
  
  # all_targets is a list of all targets that can be configured
diff --git a/test/buffer.h b/test/buffer.h

index 75016c91edd9148d7589945e9b6557d20350ca22..1fbbae1dd87f9cd7495b5f387f9a111d561f9aaf 100644 (file)
--- a/test/buffer.h
+++ b/test/buffer.h
@@ -29,16 +29,12 @@ class Buffer {
           int right_padding, int bottom_padding)
        : width_(width), height_(height), top_padding_(top_padding),
          left_padding_(left_padding), right_padding_(right_padding),
-        bottom_padding_(bottom_padding) {
-    Init();
-  }
+        bottom_padding_(bottom_padding), raw_buffer_(NULL) {}
  
    Buffer(int width, int height, int padding)
        : width_(width), height_(height), top_padding_(padding),
          left_padding_(padding), right_padding_(padding),
-        bottom_padding_(padding) {
-    Init();
-  }
+        bottom_padding_(padding), raw_buffer_(NULL) {}
  
    ~Buffer() { delete[] raw_buffer_; }
  
@@ -47,7 +43,7 @@ class Buffer {
    int stride() const { return stride_; }
  
    // Set the buffer (excluding padding) to 'value'.
-  void Set(const int value);
+  void Set(const T value);
  
    // Set the buffer (excluding padding) to the output of ACMRandom function 'b'.
    void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)());
@@ -63,11 +59,11 @@ class Buffer {
    bool HasPadding() const;
  
    // Sets all the values in the buffer to 'padding_value'.
-  void SetPadding(const int padding_value);
+  void SetPadding(const T padding_value);
  
    // Checks if all the values (excluding padding) are equal to 'value' if the
    // Buffers are the same size.
-  bool CheckValues(const int value) const;
+  bool CheckValues(const T value) const;
  
    // Check that padding matches the expected value or there is no padding.
    bool CheckPadding() const;
@@ -75,21 +71,22 @@ class Buffer {
    // Compare the non-padding portion of two buffers if they are the same size.
    bool CheckValues(const Buffer<T> &a) const;
  
- private:
-  void Init() {
-    ASSERT_GT(width_, 0);
-    ASSERT_GT(height_, 0);
-    ASSERT_GE(top_padding_, 0);
-    ASSERT_GE(left_padding_, 0);
-    ASSERT_GE(right_padding_, 0);
-    ASSERT_GE(bottom_padding_, 0);
+  bool Init() {
+    EXPECT_GT(width_, 0);
+    EXPECT_GT(height_, 0);
+    EXPECT_GE(top_padding_, 0);
+    EXPECT_GE(left_padding_, 0);
+    EXPECT_GE(right_padding_, 0);
+    EXPECT_GE(bottom_padding_, 0);
      stride_ = left_padding_ + width_ + right_padding_;
      raw_size_ = stride_ * (top_padding_ + height_ + bottom_padding_);
      raw_buffer_ = new (std::nothrow) T[raw_size_];
-    ASSERT_TRUE(raw_buffer_ != NULL);
+    EXPECT_TRUE(raw_buffer_ != NULL);
      SetPadding(std::numeric_limits<T>::max());
+    return !::testing::Test::HasFailure();
    }
  
+ private:
    bool BufferSizesMatch(const Buffer<T> &a) const;
  
    const int width_;
@@ -98,7 +95,7 @@ class Buffer {
    const int left_padding_;
    const int right_padding_;
    const int bottom_padding_;
-  int padding_value_;
+  T padding_value_;
    int stride_;
    int raw_size_;
    T *raw_buffer_;
@@ -106,11 +103,13 @@ class Buffer {
  
  template <typename T>
  T *Buffer<T>::TopLeftPixel() const {
+  if (!raw_buffer_) return NULL;
    return raw_buffer_ + (top_padding_ * stride()) + left_padding_;
  }
  
  template <typename T>
-void Buffer<T>::Set(const int value) {
+void Buffer<T>::Set(const T value) {
+  if (!raw_buffer_) return;
    T *src = TopLeftPixel();
    for (int height = 0; height < height_; ++height) {
      for (int width = 0; width < width_; ++width) {
@@ -122,6 +121,7 @@ void Buffer<T>::Set(const int value) {
  
  template <typename T>
  void Buffer<T>::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) {
+  if (!raw_buffer_) return;
    T *src = TopLeftPixel();
    for (int height = 0; height < height_; ++height) {
      for (int width = 0; width < width_; ++width) {
@@ -133,9 +133,8 @@ void Buffer<T>::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) {
  
  template <typename T>
  void Buffer<T>::CopyFrom(const Buffer<T> &a) {
-  if (!BufferSizesMatch(a)) {
-    return;
-  }
+  if (!raw_buffer_) return;
+  if (!BufferSizesMatch(a)) return;
  
    T *a_src = a.TopLeftPixel();
    T *b_src = this->TopLeftPixel();
@@ -150,6 +149,7 @@ void Buffer<T>::CopyFrom(const Buffer<T> &a) {
  
  template <typename T>
  void Buffer<T>::DumpBuffer() const {
+  if (!raw_buffer_) return;
    for (int height = 0; height < height_ + top_padding_ + bottom_padding_;
         ++height) {
      for (int width = 0; width < stride(); ++width) {
@@ -161,14 +161,14 @@ void Buffer<T>::DumpBuffer() const {
  
  template <typename T>
  bool Buffer<T>::HasPadding() const {
+  if (!raw_buffer_) return false;
    return top_padding_ || left_padding_ || right_padding_ || bottom_padding_;
  }
  
  template <typename T>
  void Buffer<T>::PrintDifference(const Buffer<T> &a) const {
-  if (!BufferSizesMatch(a)) {
-    return;
-  }
+  if (!raw_buffer_) return;
+  if (!BufferSizesMatch(a)) return;
  
    T *a_src = a.TopLeftPixel();
    T *b_src = TopLeftPixel();
@@ -206,7 +206,8 @@ void Buffer<T>::PrintDifference(const Buffer<T> &a) const {
  }
  
  template <typename T>
-void Buffer<T>::SetPadding(const int padding_value) {
+void Buffer<T>::SetPadding(const T padding_value) {
+  if (!raw_buffer_) return;
    padding_value_ = padding_value;
  
    T *src = raw_buffer_;
@@ -216,7 +217,8 @@ void Buffer<T>::SetPadding(const int padding_value) {
  }
  
  template <typename T>
-bool Buffer<T>::CheckValues(const int value) const {
+bool Buffer<T>::CheckValues(const T value) const {
+  if (!raw_buffer_) return false;
    T *src = TopLeftPixel();
    for (int height = 0; height < height_; ++height) {
      for (int width = 0; width < width_; ++width) {
@@ -231,9 +233,8 @@ bool Buffer<T>::CheckValues(const int value) const {
  
  template <typename T>
  bool Buffer<T>::CheckPadding() const {
-  if (!HasPadding()) {
-    return true;
-  }
+  if (!raw_buffer_) return false;
+  if (!HasPadding()) return true;
  
    // Top padding.
    T const *top = raw_buffer_;
@@ -278,9 +279,8 @@ bool Buffer<T>::CheckPadding() const {
  
  template <typename T>
  bool Buffer<T>::CheckValues(const Buffer<T> &a) const {
-  if (!BufferSizesMatch(a)) {
-    return false;
-  }
+  if (!raw_buffer_) return false;
+  if (!BufferSizesMatch(a)) return false;
  
    T *a_src = a.TopLeftPixel();
    T *b_src = this->TopLeftPixel();
@@ -298,6 +298,7 @@ bool Buffer<T>::CheckValues(const Buffer<T> &a) const {
  
  template <typename T>
  bool Buffer<T>::BufferSizesMatch(const Buffer<T> &a) const {
+  if (!raw_buffer_) return false;
    if (a.width_ != this->width_ || a.height_ != this->height_) {
      printf(
          "Reference buffer of size %dx%d does not match this buffer which is "
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc

index 3feba7127430cb4d3126a5eda5db3b492d23ed74..3c1d746cd96dc92ca41e01150ca3d440570fa2e0 100644 (file)
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -80,6 +80,7 @@ TEST_P(AvgPredTest, SizeCombinations) {
          // Only the reference buffer may have a stride not equal to width.
          Buffer<uint8_t> ref =
              Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+        ASSERT_TRUE(ref.Init());
  
          fill(&rnd_, pred, width, height);
          ref.Set(&rnd_, &ACMRandom::Rand8);
@@ -98,6 +99,7 @@ TEST_P(AvgPredTest, CompareReferenceRandom) {
    const int width = 64;
    const int height = 32;
    Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8);
+  ASSERT_TRUE(ref.Init());
    DECLARE_ALIGNED(16, uint8_t, pred[width * height]);
    DECLARE_ALIGNED(16, uint8_t, avg_ref[width * height]);
    DECLARE_ALIGNED(16, uint8_t, avg_chk[width * height]);
@@ -128,6 +130,7 @@ TEST_P(AvgPredTest, DISABLED_Speed) {
          const int height = 1 << height_pow;
          Buffer<uint8_t> ref =
              Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+        ASSERT_TRUE(ref.Init());
  
          fill(&rnd_, pred, width, height);
          ref.Set(&rnd_, &ACMRandom::Rand8);
@@ -156,6 +159,12 @@ INSTANTIATE_TEST_CASE_P(C, AvgPredTest,
  INSTANTIATE_TEST_CASE_P(SSE2, AvgPredTest,
                          ::testing::Values(&vpx_comp_avg_pred_sse2));
  #endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, AvgPredTest,
+                        ::testing::Values(&vpx_comp_avg_pred_neon));
+#endif  // HAVE_NEON
+
  #if HAVE_VSX
  INSTANTIATE_TEST_CASE_P(VSX, AvgPredTest,
                          ::testing::Values(&vpx_comp_avg_pred_vsx));
diff --git a/test/datarate_test.cc b/test/datarate_test.cc

index b93148e34df4cc0a586b145a4ea8687db3c8f119..a120a88d2a249f9abd675021f4f45f55fdae215b 100644 (file)
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -792,26 +792,29 @@ TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) {
                                         30, 1, 0, 140);
  
    const int kDropFrameThreshTestStep = 30;
-  vpx_codec_pts_t last_drop = 140;
-  int last_num_drops = 0;
-  for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
-    cfg_.rc_dropframe_thresh = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-        << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-        << " The datarate for the file is greater than target by too much!";
-    ASSERT_LE(first_drop_, last_drop)
-        << " The first dropped frame for drop_thresh " << i
-        << " > first dropped frame for drop_thresh "
-        << i - kDropFrameThreshTestStep;
-    ASSERT_GE(num_drops_, last_num_drops * 0.85)
-        << " The number of dropped frames for drop_thresh " << i
-        << " < number of dropped frames for drop_thresh "
-        << i - kDropFrameThreshTestStep;
-    last_drop = first_drop_;
-    last_num_drops = num_drops_;
+  for (int j = 50; j <= 150; j += 100) {
+    cfg_.rc_target_bitrate = j;
+    vpx_codec_pts_t last_drop = 140;
+    int last_num_drops = 0;
+    for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
+      cfg_.rc_dropframe_thresh = i;
+      ResetModel();
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+          << " The datarate for the file is greater than target by too much!";
+      ASSERT_LE(first_drop_, last_drop)
+          << " The first dropped frame for drop_thresh " << i
+          << " > first dropped frame for drop_thresh "
+          << i - kDropFrameThreshTestStep;
+      ASSERT_GE(num_drops_, last_num_drops * 0.85)
+          << " The number of dropped frames for drop_thresh " << i
+          << " < number of dropped frames for drop_thresh "
+          << i - kDropFrameThreshTestStep;
+      last_drop = first_drop_;
+      last_num_drops = num_drops_;
+    }
    }
  }
  
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc

index bd2327520bce00cdc61d0e7d3f443d670e76c4f8..aa90bfa181a63dcac7d966af4bbf97f52c8d0312 100644 (file)
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -440,7 +440,7 @@ INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT,
  
  #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
  INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct4x4_c,
+                        ::testing::Values(make_tuple(&vpx_fdct4x4_neon,
                                                       &vpx_idct4x4_16_add_neon,
                                                       0, VPX_BITS_8)));
  #if !CONFIG_VP9_HIGHBITDEPTH
diff --git a/test/idct_test.cc b/test/idct_test.cc

index 084b2ed0cf3fb370e0fc5886d4f5f2969c78de06..5936d469b46b1a77f00daf94ed9a2c90b7789e15 100644 (file)
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -31,11 +31,16 @@ class IDCTTest : public ::testing::TestWithParam<IdctFunc> {
      UUT = GetParam();
  
+    // new (std::nothrow) returns NULL on allocation failure, so each pointer
+    // must be checked before Init() is called through it.
      input = new (std::nothrow) Buffer<int16_t>(4, 4, 0);
      ASSERT_TRUE(input != NULL);
+    ASSERT_TRUE(input->Init());
      predict = new (std::nothrow) Buffer<uint8_t>(4, 4, 3);
      ASSERT_TRUE(predict != NULL);
+    ASSERT_TRUE(predict->Init());
      output = new (std::nothrow) Buffer<uint8_t>(4, 4, 3);
      ASSERT_TRUE(output != NULL);
+    ASSERT_TRUE(output->Init());
    }
  
    virtual void TearDown() {
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc

index 2dda7da4520075fbef59c3bcba483ac90c6cf119..c76308bbb7f12ba7ab0e9ee30db433df68df9944 100644 (file)
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -128,12 +128,12 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
    }
  
    void InitInput() {
-    const int max_coeff = 32766 / 4;
-    int max_energy_leftover = max_coeff * max_coeff;
+    const int64_t max_coeff = (32766 << (bit_depth_ - 8)) / 4;
+    int64_t max_energy_leftover = max_coeff * max_coeff;
      for (int j = 0; j < last_nonzero_; ++j) {
-      int16_t coeff = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
-                                           (rnd_.Rand16() - 32768) / 65536);
-      max_energy_leftover -= coeff * coeff;
+      tran_low_t coeff = static_cast<tran_low_t>(
+          sqrt(1.0 * max_energy_leftover) * (rnd_.Rand16() - 32768) / 65536);
+      max_energy_leftover -= static_cast<int64_t>(coeff) * coeff;
        if (max_energy_leftover < 0) {
          max_energy_leftover = 0;
          coeff = 0;
@@ -142,6 +142,36 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
      }
    }
  
+  void PrintDiff() {
+    if (memcmp(output_block_ref_, output_block_,
+               pixel_size_ * output_block_size_)) {
+      uint16_t ref, opt;
+      for (int y = 0; y < size_; y++) {
+        for (int x = 0; x < size_; x++) {
+          if (pixel_size_ == 1) {
+            ref = output_block_ref_[y * stride_ + x];
+            opt = output_block_[y * stride_ + x];
+          } else {
+            ref = reinterpret_cast<uint16_t *>(
+                output_block_ref_)[y * stride_ + x];
+            opt = reinterpret_cast<uint16_t *>(output_block_)[y * stride_ + x];
+          }
+          if (ref != opt) {
+            printf("dest[%d][%d] diff:%6d (ref),%6d (opt)\n", y, x, ref, opt);
+          }
+        }
+      }
+
+      printf("\ninput_block_:\n");
+      for (int y = 0; y < size_; y++) {
+        for (int x = 0; x < size_; x++) {
+          printf("%6d,", input_block_[y * size_ + x]);
+        }
+        printf("\n");
+      }
+    }
+  }
+
   protected:
    int last_nonzero_;
    TX_SIZE tx_size_;
@@ -162,23 +192,32 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
  };
  
  TEST_P(PartialIDctTest, RunQuantCheck) {
+  const int count_test_block = (size_ != 4) ? kCountTestBlock : 65536;
    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
  
    InitMem();
-  for (int i = 0; i < kCountTestBlock; ++i) {
+
+  for (int i = 0; i < count_test_block; ++i) {
      // Initialize a test block with input range [-mask_, mask_].
-    if (i == 0) {
-      for (int k = 0; k < input_block_size_; ++k) {
-        input_extreme_block[k] = mask_;
-      }
-    } else if (i == 1) {
-      for (int k = 0; k < input_block_size_; ++k) {
-        input_extreme_block[k] = -mask_;
+    if (size_ != 4) {
+      if (i == 0) {
+        for (int k = 0; k < input_block_size_; ++k) {
+          input_extreme_block[k] = mask_;
+        }
+      } else if (i == 1) {
+        for (int k = 0; k < input_block_size_; ++k) {
+          input_extreme_block[k] = -mask_;
+        }
+      } else {
+        for (int k = 0; k < input_block_size_; ++k) {
+          input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_;
+        }
        }
      } else {
+      // Try all possible combinations.
        for (int k = 0; k < input_block_size_; ++k) {
-        input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_;
+        input_extreme_block[k] = (i & (1 << k)) ? mask_ : -mask_;
        }
      }
  
@@ -277,9 +316,9 @@ TEST_P(PartialIDctTest, DISABLED_Speed) {
    vpx_usec_timer_mark(&timer);
    const int elapsed_time =
        static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
-  printf("idct%dx%d_%d (bitdepth %d) time: %5d ms\n", size_, size_,
-         last_nonzero_, bit_depth_, elapsed_time);
-
+  printf("idct%dx%d_%d (%s %d) time: %5d ms\n", size_, size_, last_nonzero_,
+         (pixel_size_ == 1) ? "bitdepth" : "high bitdepth", bit_depth_,
+         elapsed_time);
    ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
                        pixel_size_ * output_block_size_))
        << "Error: partial inverse transform produces different results";
@@ -619,6 +658,15 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
    make_tuple(
        &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
        &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 12, 2),
    make_tuple(&vpx_highbd_fdct8x8_c,
               &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
               &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse2>, TX_8X8, 64, 8, 2),
@@ -637,6 +685,12 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
    make_tuple(
        &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
        &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse2>, TX_8X8, 12, 12, 2),
+  make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 8, 2),
+  make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 10, 2),
+  make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 12, 2),
    make_tuple(&vpx_highbd_fdct4x4_c,
               &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
               &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 8, 2),
@@ -646,6 +700,12 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
    make_tuple(
        &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
        &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 12, 2),
+  make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 8, 2),
+  make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 10, 2),
+  make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 12, 2),
  #endif  // CONFIG_VP9_HIGHBITDEPTH
    make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
               &wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1),
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc

index 95da09c314826175d4aecfd5d0b2a091fd15482d..b11a1ba25ee88b900d02e82f3a115c1101fb2c81 100644 (file)
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -57,12 +57,14 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
  
    // 5-tap filter needs 2 padding rows above and below the block in the input.
    Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2);
+  ASSERT_TRUE(src_image.Init());
  
    // Filter extends output block by 8 samples at left and right edges.
    // Though the left padding is only 8 bytes, the assembly code tries to
    // read 16 bytes before the pointer.
    Buffer<uint8_t> dst_image =
        Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8);
+  ASSERT_TRUE(dst_image.Init());
  
    uint8_t *const flimits =
        reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
@@ -108,6 +110,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
    // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
    Buffer<uint8_t> src_image =
        Buffer<uint8_t>(block_width, block_height, 2, 2, 10, 2);
+  ASSERT_TRUE(src_image.Init());
  
    // Filter extends output block by 8 samples at left and right edges.
    // Though the left padding is only 8 bytes, there is 'above' padding as well
@@ -116,7 +119,9 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
    // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
    Buffer<uint8_t> dst_image =
        Buffer<uint8_t>(block_width, block_height, 8, 8, 16, 8);
+  ASSERT_TRUE(dst_image.Init());
    Buffer<uint8_t> dst_image_ref = Buffer<uint8_t>(block_width, block_height, 8);
+  ASSERT_TRUE(dst_image_ref.Init());
  
    // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock
    // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so
@@ -197,10 +202,12 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) {
    const int cols = 16;
  
    Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  ASSERT_TRUE(src.Init());
    src.SetPadding(10);
    SetCols(src.TopLeftPixel(), rows, cols, src.stride());
  
    Buffer<uint8_t> expected_output = Buffer<uint8_t>(cols, rows, 0);
+  ASSERT_TRUE(expected_output.Init());
    SetCols(expected_output.TopLeftPixel(), rows, cols, expected_output.stride());
  
    RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(0),
@@ -212,6 +219,7 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) {
    const int cols = 16;
  
    Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  ASSERT_TRUE(src.Init());
    src.SetPadding(10);
    SetCols(src.TopLeftPixel(), rows, cols, src.stride());
  
@@ -228,6 +236,7 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) {
    const int cols = 16;
  
    Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  ASSERT_TRUE(src.Init());
    src.SetPadding(10);
    SetCols(src.TopLeftPixel(), rows, cols, src.stride());
  
@@ -249,7 +258,9 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) {
    const int cols = 16;
  
    Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  ASSERT_TRUE(c_mem.Init());
    Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+  ASSERT_TRUE(asm_mem.Init());
  
    // When level >= 100, the filter behaves the same as the level = INT_MAX
    // When level < 20, it behaves the same as the level = 0
@@ -305,6 +316,7 @@ TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) {
    const int cols = 16;
  
    Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+  ASSERT_TRUE(src_c.Init());
    src_c.SetPadding(10);
  
    SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
@@ -340,6 +352,7 @@ TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) {
    const int cols = 16;
  
    Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+  ASSERT_TRUE(src_c.Init());
    src_c.SetPadding(10);
  
    SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
@@ -370,6 +383,7 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) {
    const int cols = 16;
  
    Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+  ASSERT_TRUE(src_c.Init());
    src_c.SetPadding(10);
  
    SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
@@ -392,7 +406,9 @@ TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) {
    rnd.Reset(ACMRandom::DeterministicSeed());
  
    Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+  ASSERT_TRUE(src_c.Init());
    Buffer<uint8_t> src_asm = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+  ASSERT_TRUE(src_asm.Init());
  
    for (int level = 0; level < 100; level++) {
      src_c.SetPadding(10);
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc

index 8615ba45af3f4c55816eba79b503c9331e7c2738..0395eb82e41a76c1ef8e7c6c430db4c4f4fee955 100644 (file)
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -35,6 +35,7 @@ void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w,
                        Buffer<unsigned int> *accumulator,
                        Buffer<uint16_t> *count) {
    Buffer<int> diff_sq = Buffer<int>(w, h, 0);
+  ASSERT_TRUE(diff_sq.Init());
    diff_sq.Set(0);
  
    int rounding = 0;
@@ -119,6 +120,7 @@ TEST_P(TemporalFilterTest, SizeCombinations) {
    // Depending on subsampling this function may be called with values of 8 or 16
    // for width and height, in any combination.
    Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+  ASSERT_TRUE(a.Init());
  
    const int filter_weight = 2;
    const int filter_strength = 6;
@@ -127,10 +129,15 @@ TEST_P(TemporalFilterTest, SizeCombinations) {
      for (int height = 8; height <= 16; height += 8) {
        // The second buffer must not have any border.
        Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
        Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
        Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
        Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
        Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
  
        a.Set(&rnd_, &ACMRandom::Rand8);
        b.Set(&rnd_, &ACMRandom::Rand8);
@@ -161,12 +168,18 @@ TEST_P(TemporalFilterTest, CompareReferenceRandom) {
    for (int width = 8; width <= 16; width += 8) {
      for (int height = 8; height <= 16; height += 8) {
        Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
+      ASSERT_TRUE(a.Init());
        // The second buffer must not have any border.
        Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
        Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
        Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
        Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
        Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
  
        for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
          for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
@@ -202,6 +215,7 @@ TEST_P(TemporalFilterTest, CompareReferenceRandom) {
  
  TEST_P(TemporalFilterTest, DISABLED_Speed) {
    Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+  ASSERT_TRUE(a.Init());
  
    const int filter_weight = 2;
    const int filter_strength = 6;
@@ -210,10 +224,15 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) {
      for (int height = 8; height <= 16; height += 8) {
        // The second buffer must not have any border.
        Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
        Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
        Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
        Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
        Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
  
        a.Set(&rnd_, &ACMRandom::Rand8);
        b.Set(&rnd_, &ACMRandom::Rand8);
diff --git a/test/variance_test.cc b/test/variance_test.cc

index 4fc5cf5d65293232d8212e2a9ee0e3030b1f8197..d5727a67a53a736ef8eb0f7a48479993683ba1ae 100644 (file)
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1259,7 +1259,9 @@ INSTANTIATE_TEST_CASE_P(
                        VarianceParams(4, 3, &vpx_variance16x8_neon),
                        VarianceParams(3, 4, &vpx_variance8x16_neon),
                        VarianceParams(3, 3, &vpx_variance8x8_neon),
-                      VarianceParams(3, 2, &vpx_variance8x4_neon)));
+                      VarianceParams(3, 2, &vpx_variance8x4_neon),
+                      VarianceParams(2, 3, &vpx_variance4x8_neon),
+                      VarianceParams(2, 2, &vpx_variance4x4_neon)));
  
  INSTANTIATE_TEST_CASE_P(
      NEON, VpxSubpelVarianceTest,
@@ -1273,7 +1275,26 @@ INSTANTIATE_TEST_CASE_P(
                        make_tuple(4, 3, &vpx_sub_pixel_variance16x8_neon, 0),
                        make_tuple(3, 4, &vpx_sub_pixel_variance8x16_neon, 0),
                        make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_neon, 0)));
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_neon, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_neon, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_neon, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, VpxSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0),
+        make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0),
+        make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0),
+        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0),
+        make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0),
+        make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0),
+        make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0),
+        make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0),
+        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0),
+        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0),
+        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
+        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
+        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
  #endif  // HAVE_NEON
  
  #if HAVE_MSA
diff --git a/test/vpx_temporal_svc_encoder.sh b/test/vpx_temporal_svc_encoder.sh

index 3d5152ae34fb924d81962d60626d302342aba35e..56a7902f4f2e32cf9ad86540020ac5a55c8d9f73 100755 (executable)
--- a/test/vpx_temporal_svc_encoder.sh
+++ b/test/vpx_temporal_svc_encoder.sh
@@ -52,11 +52,19 @@ vpx_tsvc_encoder() {
  
    # TODO(tomfinegan): Verify file output for all thread runs.
    for threads in $(seq $max_threads); do
-    eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \
-        "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
-        "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \
-        "${error_resilient}" "${threads}" "$@" \
-        ${devnull}
+    if [ "$(vpx_config_option_enabled CONFIG_VP9_HIGHBITDEPTH)" != "yes" ]; then
+      eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \
+        "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+        "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \
+        "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \
+        "$@" ${devnull}
+    else
+      eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \
+        "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+        "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \
+        "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \
+        "$@" "8" ${devnull}
+    fi
    done
  }
  
diff --git a/vp8/common/arm/neon/bilinearpredict_neon.c b/vp8/common/arm/neon/bilinearpredict_neon.c

index af566c2c41a1639980c11461e5df794ff37eff31..8520ab5ca01641a5dac98cd8419aa7843eb3447d 100644 (file)
--- a/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -11,6 +11,7 @@
  #include <arm_neon.h>
  #include <string.h>
  #include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
  
  static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 },
                                                 { 96, 32 }, { 80, 48 },
@@ -21,35 +22,6 @@ static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
    return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
  }
  
-static INLINE void store4x4(unsigned char *dst, int dst_stride,
-                            const uint8x8_t a0, const uint8x8_t a1) {
-  if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) {
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1);
-  } else {
-    // Store to the aligned local buffer and memcpy instead of vget_lane_u8
-    // which is really really slow.
-    uint32_t output_buffer[4];
-    vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0);
-    vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1);
-    vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0);
-    vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1);
-
-    memcpy(dst, output_buffer, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 1, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 2, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 3, 4);
-  }
-}
-
  void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr,
                                    int src_pixels_per_line, int xoffset,
                                    int yoffset, unsigned char *dst_ptr,
@@ -122,7 +94,7 @@ void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr,
  
    // secondpass_filter
    if (yoffset == 0) {  // skip_2ndpass_filter
-    store4x4(dst_ptr, dst_pitch, e0, e1);
+    store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
    } else {
      uint8x8_t f0, f1;
      const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
@@ -140,7 +112,7 @@ void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr,
      f0 = vqrshrn_n_u16(b0, 7);
      f1 = vqrshrn_n_u16(b1, 7);
  
-    store4x4(dst_ptr, dst_pitch, f0, f1);
+    store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(f0, f1));
    }
  }
  
diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c

index fbb552ebe2a42b5e1c79232e3570369db200526a..aa2567df792e2148479d46b00c89e4b2afffe6e5 100644 (file)
--- a/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/vp8/common/arm/neon/sixtappredict_neon.c
@@ -11,6 +11,7 @@
  #include <arm_neon.h>
  #include <string.h>
  #include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_ports/mem.h"
  
  static const int8_t vp8_sub_pel_filters[8][8] = {
@@ -42,35 +43,6 @@ static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
    return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
  }
  
-static INLINE void store4x4(unsigned char *dst, int dst_stride,
-                            const uint8x8_t a0, const uint8x8_t a1) {
-  if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) {
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1);
-  } else {
-    // Store to the aligned local buffer and memcpy instead of vget_lane_u8
-    // which is really really slow.
-    uint32_t output_buffer[4];
-    vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0);
-    vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1);
-    vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0);
-    vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1);
-
-    memcpy(dst, output_buffer, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 1, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 2, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 3, 4);
-  }
-}
-
  static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
                                           const uint8x8_t filter, uint16x8_t *c,
                                           uint16x8_t *d) {
@@ -180,7 +152,7 @@ static INLINE void yonly4x4(const unsigned char *src, int src_stride,
    e0 = vqrshrun_n_s16(d0, 7);
    e1 = vqrshrun_n_s16(d1, 7);
  
-  store4x4(dst, dst_stride, e0, e1);
+  store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
  }
  
  void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
@@ -297,7 +269,7 @@ void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
    b2 = vqrshrun_n_s16(e4567, 7);
  
    if (yoffset == 0) {  // firstpass_filter4x4_only
-    store4x4(dst_ptr, dst_pitch, b0, b2);
+    store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
      return;
    }
  
@@ -411,7 +383,7 @@ void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
    e0 = vqrshrun_n_s16(d0, 7);
    e1 = vqrshrun_n_s16(d1, 7);
  
-  store4x4(dst_ptr, dst_pitch, e0, e1);
+  store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
  }
  
  void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
diff --git a/vp8/common/skin_detection.c b/vp8/common/skin_detection.c

new file mode 100644 (file)

index 0000000..4fd2da8
--- /dev/null
+++ b/vp8/common/skin_detection.c
@@ -0,0 +1,171 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/skin_detection.h"
+#include "vp8/common/alloccommon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define MODEL_MODE 1
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[5][2] = { { 7463, 9614 },
+                                     { 6400, 10240 },
+                                     { 7040, 10240 },
+                                     { 8320, 9280 },
+                                     { 6800, 9614 } };
+static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 };  // q16
+static const int skin_threshold[6] = { 1570636, 1400000, 800000,
+                                       800000,  800000,  800000 };  // q18
+
+// Thresholds on luminance.
+static const int y_low = 40;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
+static int evaluate_skin_color_difference(const int cb, const int cr,
+                                          const int idx) {
+  const int cb_q6 = cb << 6;
+  const int cr_q6 = cr << 6;
+  const int cb_diff_q12 =
+      (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
+  const int cbcr_diff_q12 =
+      (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
+  const int cr_diff_q12 =
+      (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
+  const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+  const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+  const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
+  const int skin_diff =
+      skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
+      skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
+  return skin_diff;
+}
+
+// Checks if the input yCbCr values correspond to skin color.
+int skin_pixel(int y, int cb, int cr, int motion) {
+  if (y < y_low || y > y_high) {
+    return 0;
+  } else {
+    if (MODEL_MODE == 0) {
+      return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
+    } else {
+      int i = 0;
+      // Exit on grey.
+      if (cb == 128 && cr == 128) return 0;
+      // Exit on very strong cb.
+      if (cb > 150 && cr < 110) return 0;
+      for (; i < 5; ++i) {
+        int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
+        if (skin_color_diff < skin_threshold[i + 1]) {
+          if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) {
+            return 0;
+          } else if (motion == 0 &&
+                     skin_color_diff > (skin_threshold[i + 1] >> 1)) {
+            return 0;
+          } else {
+            return 1;
+          }
+        }
+        // Exit if difference is much larger than the threshold.
+        if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
+          return 0;
+        }
+      }
+      return 0;
+    }
+  }
+}
+
+int compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                       int stride, int strideuv, int consec_zeromv,
+                       int curr_motion_magn) {
+  // No skin if block has been zero/small motion for long consecutive time.
+  if (consec_zeromv > 60 && curr_motion_magn == 0) {
+    return 0;
+  } else {
+    int motion = 1;
+    // Take the average of center 2x2 pixels.
+    const int ysource = (y[7 * stride + 7] + y[7 * stride + 8] +
+                         y[8 * stride + 7] + y[8 * stride + 8]) >>
+                        2;
+    const int usource = (u[3 * strideuv + 3] + u[3 * strideuv + 4] +
+                         u[4 * strideuv + 3] + u[4 * strideuv + 4]) >>
+                        2;
+    const int vsource = (v[3 * strideuv + 3] + v[3 * strideuv + 4] +
+                         v[4 * strideuv + 3] + v[4 * strideuv + 4]) >>
+                        2;
+    if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0;
+    return skin_pixel(ysource, usource, vsource, motion);
+  }
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) {
+  int i, j, mb_row, mb_col, num_bl;
+  VP8_COMMON *const cm = &cpi->common;
+  uint8_t *y;
+  const uint8_t *src_y = cpi->Source->y_buffer;
+  const uint8_t *src_u = cpi->Source->u_buffer;
+  const uint8_t *src_v = cpi->Source->v_buffer;
+  const int src_ystride = cpi->Source->y_stride;
+  const int src_uvstride = cpi->Source->uv_stride;
+
+  YV12_BUFFER_CONFIG skinmap;
+  memset(&skinmap, 0, sizeof(skinmap));
+  if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height,
+                                  VP8BORDERINPIXELS) < 0) {
+    vpx_free_frame_buffer(&skinmap);
+    return;
+  }
+  memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+  y = skinmap.y_buffer;
+  // Loop through blocks and set skin map based on center pixel of block.
+  // Set y to white for skin block, otherwise set to source with gray scale.
+  // Ignore rightmost/bottom boundary blocks.
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) {
+    num_bl = 0;
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) {
+      int is_skin = 0;
+      int consec_zeromv = 0;
+      const int bl_index = mb_row * cm->mb_cols + mb_col;
+      const int bl_index1 = bl_index + 1;
+      const int bl_index2 = bl_index + cm->mb_cols;
+      const int bl_index3 = bl_index2 + 1;
+      consec_zeromv = VPXMIN(cpi->consec_zero_last[bl_index],
+                             VPXMIN(cpi->consec_zero_last[bl_index1],
+                                    VPXMIN(cpi->consec_zero_last[bl_index2],
+                                           cpi->consec_zero_last[bl_index3])));
+      is_skin = compute_skin_block(src_y, src_u, src_v, src_ystride,
+                                   src_uvstride, consec_zeromv, 0);
+      for (i = 0; i < 16; i++) {
+        for (j = 0; j < 16; j++) {
+          if (is_skin)
+            y[i * src_ystride + j] = 255;
+          else
+            y[i * src_ystride + j] = src_y[i * src_ystride + j];
+        }
+      }
+      num_bl++;
+      y += 16;
+      src_y += 16;
+      src_u += 8;
+      src_v += 8;
+    }
+    y += (src_ystride << 4) - (num_bl << 4);
+    src_y += (src_ystride << 4) - (num_bl << 4);
+    src_u += (src_uvstride << 3) - (num_bl << 3);
+    src_v += (src_uvstride << 3) - (num_bl << 3);
+  }
+  vp8_write_yuv_frame(yuv_skinmap_file, &skinmap);
+  vpx_free_frame_buffer(&skinmap);
+}
+#endif  // OUTPUT_YUV_SKINMAP
diff --git a/vp8/common/skin_detection.h b/vp8/common/skin_detection.h

new file mode 100644 (file)

index 0000000..4cb22a7
--- /dev/null
+++ b/vp8/common/skin_detection.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_SKIN_DETECTION_H_
+#define VP8_ENCODER_SKIN_DETECTION_H_
+
+#include "vp8/encoder/onyx_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+
+// #define OUTPUT_YUV_SKINMAP
+
+int skin_pixel(int y, int cb, int cr, int motion);
+
+int compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                       int stride, int strideuv, int consec_zeromv,
+                       int curr_motion_magn);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file);
+extern void vp8_write_yuv_frame(FILE *f, YV12_BUFFER_CONFIG *s);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_SKIN_DETECTION_H_
diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c

index 73435a7dde069cd6d8901a89ed4b10b05a827b6b..2405342f02a3448a0f50bac86f39b836aafcc3fe 100644 (file)
--- a/vp8/common/x86/filter_x86.c
+++ b/vp8/common/x86/filter_x86.c
@@ -17,8 +17,7 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = {
    { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 }
  };
  
-DECLARE_PROTECTED(DECLARE_ALIGNED(16, const short,
-                                  vp8_bilinear_filters_x86_8[8][16])) = {
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = {
    { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
    { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c

index 64d177581ede6bef3c633ef1291b7a4361172a93..9227f750d55979b572d6d6be054069c9e9ae972f 100644 (file)
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -16,6 +16,7 @@
  #include "vp8/common/blockd.h"
  #include "onyx_int.h"
  #include "vp8/common/systemdependent.h"
+#include "vp8/common/skin_detection.h"
  #include "vp8/encoder/quantize.h"
  #include "vp8/common/alloccommon.h"
  #include "mcomp.h"
@@ -87,6 +88,9 @@ FILE *yuv_file;
  #ifdef OUTPUT_YUV_DENOISED
  FILE *yuv_denoised_file;
  #endif
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
  
  #if 0
  FILE *framepsnr;
@@ -728,6 +732,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) {
    SPEED_FEATURES *sf = &cpi->sf;
    int Mode = cpi->compressor_speed;
    int Speed = cpi->Speed;
+  int Speed2;
    int i;
    VP8_COMMON *cm = &cpi->common;
    int last_improved_quant = sf->improved_quant;
@@ -829,9 +834,16 @@ void vp8_set_speed_features(VP8_COMP *cpi) {
    cpi->mode_check_freq[THR_V_PRED] = cpi->mode_check_freq[THR_H_PRED] =
        cpi->mode_check_freq[THR_B_PRED] =
            speed_map(Speed, mode_check_freq_map_vhbpred);
-  cpi->mode_check_freq[THR_NEW1] = speed_map(Speed, mode_check_freq_map_new1);
+
+  // For real-time mode at speed 10 keep the mode_check_freq threshold
+  // for NEW1 similar to that of speed 9.
+  Speed2 = Speed;
+  if (cpi->Speed == 10 && Mode == 2) Speed2 = RT(9);
+  cpi->mode_check_freq[THR_NEW1] = speed_map(Speed2, mode_check_freq_map_new1);
+
    cpi->mode_check_freq[THR_NEW2] = cpi->mode_check_freq[THR_NEW3] =
        speed_map(Speed, mode_check_freq_map_new2);
+
    cpi->mode_check_freq[THR_SPLIT1] =
        speed_map(Speed, mode_check_freq_map_split1);
    cpi->mode_check_freq[THR_SPLIT2] = cpi->mode_check_freq[THR_SPLIT3] =
@@ -1925,6 +1937,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
  #ifdef OUTPUT_YUV_DENOISED
    yuv_denoised_file = fopen("denoised.yuv", "ab");
  #endif
+#ifdef OUTPUT_YUV_SKINMAP
+  yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
  
  #if 0
      framepsnr = fopen("framepsnr.stt", "a");
@@ -2290,6 +2305,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
  #ifdef OUTPUT_YUV_DENOISED
    fclose(yuv_denoised_file);
  #endif
+#ifdef OUTPUT_YUV_SKINMAP
+  fclose(yuv_skinmap_file);
+#endif
  
  #if 0
  
@@ -2466,10 +2484,11 @@ int vp8_update_entropy(VP8_COMP *cpi, int update) {
    return 0;
  }
  
-#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED)
+#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \
+    defined(OUTPUT_YUV_SKINMAP)
  void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
    unsigned char *src = s->y_buffer;
-  int h = s->y_height;
+  int h = s->y_crop_height;
  
    do {
      fwrite(src, s->y_width, 1, yuv_file);
@@ -2477,7 +2496,7 @@ void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
    } while (--h);
  
    src = s->u_buffer;
-  h = s->uv_height;
+  h = s->uv_crop_height;
  
    do {
      fwrite(src, s->uv_width, 1, yuv_file);
@@ -2485,7 +2504,7 @@ void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
    } while (--h);
  
    src = s->v_buffer;
-  h = s->uv_height;
+  h = s->uv_crop_height;
  
    do {
      fwrite(src, s->uv_width, 1, yuv_file);
@@ -4413,6 +4432,12 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
    }
  #endif
  
+#ifdef OUTPUT_YUV_SKINMAP
+  if (cpi->common.current_video_frame > 1) {
+    compute_skin_map(cpi, yuv_skinmap_file);
+  }
+#endif
+
  #if CONFIG_MULTITHREAD
    if (cpi->b_multi_threaded) {
      /* start loopfilter in separate thread */
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c

index eb713f11c0c7e0465b79e8726b87e3fa69aba0ef..341068bb99a2df11a86baa36faf0e754c247a7f5 100644 (file)
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -25,6 +25,7 @@
  #include "vp8/common/reconintra4x4.h"
  #include "vpx_dsp/variance.h"
  #include "mcomp.h"
+#include "vp8/common/skin_detection.h"
  #include "rdopt.h"
  #include "vpx_dsp/vpx_dsp_common.h"
  #include "vpx_mem/vpx_mem.h"
@@ -36,82 +37,9 @@
  extern unsigned int cnt_pm;
  #endif
  
-#define MODEL_MODE 1
-
  extern const int vp8_ref_frame_order[MAX_MODES];
  extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
  
-// Fixed point implementation of a skin color classifier. Skin color
-// is model by a Gaussian distribution in the CbCr color space.
-// See ../../test/skin_color_detector_test.cc where the reference
-// skin color classifier is defined.
-
-// Fixed-point skin color model parameters.
-static const int skin_mean[5][2] = { { 7463, 9614 },
-                                     { 6400, 10240 },
-                                     { 7040, 10240 },
-                                     { 8320, 9280 },
-                                     { 6800, 9614 } };
-static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 };  // q16
-static const int skin_threshold[6] = { 1570636, 1400000, 800000,
-                                       800000,  800000,  800000 };  // q18
-
-// Evaluates the Mahalanobis distance measure for the input CbCr values.
-static int evaluate_skin_color_difference(int cb, int cr, int idx) {
-  const int cb_q6 = cb << 6;
-  const int cr_q6 = cr << 6;
-  const int cb_diff_q12 =
-      (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
-  const int cbcr_diff_q12 =
-      (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
-  const int cr_diff_q12 =
-      (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
-  const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
-  const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
-  const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
-  const int skin_diff =
-      skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
-      skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
-  return skin_diff;
-}
-
-// Checks if the input yCbCr values corresponds to skin color.
-static int is_skin_color(int y, int cb, int cr, int consec_zeromv) {
-  if (y < 40 || y > 220) {
-    return 0;
-  } else {
-    if (MODEL_MODE == 0) {
-      return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
-    } else {
-      int i = 0;
-      // No skin if block has been zero motion for long consecutive time.
-      if (consec_zeromv > 60) return 0;
-      // Exit on grey.
-      if (cb == 128 && cr == 128) return 0;
-      // Exit on very strong cb.
-      if (cb > 150 && cr < 110) return 0;
-      for (; i < 5; ++i) {
-        int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
-        if (skin_color_diff < skin_threshold[i + 1]) {
-          if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) {
-            return 0;
-          } else if (consec_zeromv > 25 &&
-                     skin_color_diff > (skin_threshold[i + 1] >> 1)) {
-            return 0;
-          } else {
-            return 1;
-          }
-        }
-        // Exit if difference is much large than the threshold.
-        if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
-          return 0;
-        }
-      }
-      return 0;
-    }
-  }
-}
-
  static int macroblock_corner_grad(unsigned char *signal, int stride,
                                    int offsetx, int offsety, int sgnx,
                                    int sgny) {
@@ -760,27 +688,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
  #endif
  
    // Check if current macroblock is in skin area.
-  {
-    const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] +
-                   x->src.y_buffer[7 * x->src.y_stride + 8] +
-                   x->src.y_buffer[8 * x->src.y_stride + 7] +
-                   x->src.y_buffer[8 * x->src.y_stride + 8]) >>
-                  2;
-    const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] +
-                    x->src.u_buffer[3 * x->src.uv_stride + 4] +
-                    x->src.u_buffer[4 * x->src.uv_stride + 3] +
-                    x->src.u_buffer[4 * x->src.uv_stride + 4]) >>
-                   2;
-    const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] +
-                    x->src.v_buffer[3 * x->src.uv_stride + 4] +
-                    x->src.v_buffer[4 * x->src.uv_stride + 3] +
-                    x->src.v_buffer[4 * x->src.uv_stride + 4]) >>
-                   2;
-    x->is_skin = 0;
-    if (!cpi->oxcf.screen_content_mode) {
-      int block_index = mb_row * cpi->common.mb_cols + mb_col;
-      x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]);
-    }
+  x->is_skin = 0;
+  if (!cpi->oxcf.screen_content_mode) {
+    int block_index = mb_row * cpi->common.mb_cols + mb_col;
+    x->is_skin = compute_skin_block(
+        x->src.y_buffer, x->src.u_buffer, x->src.v_buffer, x->src.y_stride,
+        x->src.uv_stride, cpi->consec_zero_last[block_index], 0);
    }
  #if CONFIG_TEMPORAL_DENOISING
    if (cpi->oxcf.noise_sensitivity) {
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk

index 7bd41a3fb712bf296d4ad77d9a02ea1b38a2f04e..b3421002bcbb096267f767b1e729c83156722948 100644 (file)
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -61,6 +61,8 @@ VP8_CX_SRCS-yes += encoder/ratectrl.c
  VP8_CX_SRCS-yes += encoder/rdopt.c
  VP8_CX_SRCS-yes += encoder/segmentation.c
  VP8_CX_SRCS-yes += encoder/segmentation.h
+VP8_CX_SRCS-yes += common/skin_detection.c
+VP8_CX_SRCS-yes += common/skin_detection.h
  VP8_CX_SRCS-yes += encoder/tokenize.c
  VP8_CX_SRCS-yes += encoder/dct_value_cost.h
  VP8_CX_SRCS-yes += encoder/dct_value_tokens.h
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c

index b105e5d45a38556894249f50566dfc06120c9ab3..dfc315eeacf85bdadf4fde04edfc21a66e35ecd4 100644 (file)
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -380,7 +380,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
      // if mfqe is enabled. Need to take both the quality and the speed
      // into consideration.
      if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
-      vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+      vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
      }
      if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
        deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
@@ -390,7 +390,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
        vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q,
                    cm->postproc_state.limits);
      } else {
-      vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+      vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
      }
    } else if (flags & VP9D_DEMACROBLOCK) {
      deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
@@ -399,7 +399,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
    } else if (flags & VP9D_DEBLOCK) {
      vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits);
    } else {
-    vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
+    vpx_yv12_copy_frame(cm->frame_to_show, ppbuf);
    }
  
    ppstate->last_base_qindex = cm->base_qindex;
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c

index 37693f094489c62efedbc75c900813e5065068cf..94cae7e34040e0a2d365d9f4607e8755e9336701 100644 (file)
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -169,7 +169,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi,
        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                           "Incorrect buffer dimensions");
      else
-      vp8_yv12_copy_frame(cfg, sd);
+      vpx_yv12_copy_frame(cfg, sd);
    } else {
      vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame");
    }
@@ -217,7 +217,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
                         "Incorrect buffer dimensions");
    } else {
      // Overwrite the reference frame buffer.
-    vp8_yv12_copy_frame(sd, ref_buf);
+    vpx_yv12_copy_frame(sd, ref_buf);
    }
  
    return cm->error.error_code;
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c

index 98c56407596dbeafb256ef937221c9fbc48a42be..0b175969be601954a692bb4670b5124cf47b93ee 100644 (file)
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -22,6 +22,7 @@
  #include "vp9/encoder/vp9_rd.h"
  
  #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_dsp/vpx_dsp_common.h"
  
  void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c

index b4a0bbe58bd36923d9e6ecd5e37cfdaef31ac233..048ea629f5aba1376a28c9ba31715b827a830009 100644 (file)
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -277,8 +277,6 @@ void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) {
        !cpi->oxcf.gf_cbr_boost_pct) {
      // Force this frame as a golden update frame if this frame changes the
      // resolution (resize_pending != 0).
-    // TODO(marpan): check on forcing golden update if the background has very
-    // high motion in current frame.
      if (cpi->resize_pending != 0) {
        vp9_cyclic_refresh_set_golden_update(cpi);
        rc->frames_till_gf_update_due = rc->baseline_gf_interval;
@@ -316,6 +314,8 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
    else
      rc->baseline_gf_interval = 40;
    if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20;
+  if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40)
+    rc->baseline_gf_interval = 10;
  }
  
  // Update the segmentation map, and related quantities: cyclic refresh map,
@@ -425,6 +425,13 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
    int target_refresh = 0;
    double weight_segment_target = 0;
    double weight_segment = 0;
+  cr->apply_cyclic_refresh = 1;
+  if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 ||
+      (!cpi->use_svc && rc->avg_frame_low_motion < 55 &&
+       rc->frames_since_key > 40)) {
+    cr->apply_cyclic_refresh = 0;
+    return;
+  }
    cr->percent_refresh = 10;
    if (cr->reduce_refresh) cr->percent_refresh = 5;
    cr->max_qdelta_perc = 60;
@@ -493,14 +500,8 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
    const RATE_CONTROL *const rc = &cpi->rc;
    CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
    struct segmentation *const seg = &cm->seg;
-  // TODO(marpan): Look into whether we should reduce the amount/delta-qp
-  // instead of completely shutting off at low bitrates. For now keep it on.
-  // const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
-  const int apply_cyclic_refresh = 1;
    if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
-  // Don't apply refresh on key frame or temporal enhancement layer frames.
-  if (!apply_cyclic_refresh || (cm->frame_type == KEY_FRAME) ||
-      (cpi->force_update_segmentation) || (cpi->svc.temporal_layer_id > 0)) {
+  if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation)) {
      // Set segmentation map to 0 and disable.
      unsigned char *const seg_map = cpi->segmentation_map;
      memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h

index 9de5074d9ec2bb321390a537e1b4b9c3642d02b1..77fa67c9e16ef600da87b835e79c34ce135c79df 100644 (file)
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -67,6 +67,7 @@ struct CYCLIC_REFRESH {
    int qindex_delta[3];
    int reduce_refresh;
    double weight_segment;
+  int apply_cyclic_refresh;
  };
  
  struct VP9_COMP;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c

index 0bd8bdbf539f1b5b3bae566d5d6a56090c0f4e60..6215e198ca6d3842e7e70152e95ab9ad1dbb5e86 100644 (file)
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -841,7 +841,8 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
    }
  }
  
-static void copy_partitioning_helper(VP9_COMP *cpi, BLOCK_SIZE bsize,
+static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x,
+                                     MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                       int mi_row, int mi_col) {
    VP9_COMMON *const cm = &cpi->common;
    BLOCK_SIZE *prev_part = cpi->prev_partition;
@@ -851,49 +852,61 @@ static void copy_partitioning_helper(VP9_COMP *cpi, BLOCK_SIZE bsize,
    const int bs = (1 << bsl) / 4;
    BLOCK_SIZE subsize;
    PARTITION_TYPE partition;
-  MODE_INFO *mi = NULL;
  
    if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
  
    partition = partition_lookup[bsl][prev_part[start_pos]];
    subsize = get_subsize(bsize, partition);
-  mi = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
  
    if (subsize < BLOCK_8X8) {
-    mi->sb_type = bsize;
+    set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
    } else {
      switch (partition) {
-      case PARTITION_NONE: mi->sb_type = bsize; break;
+      case PARTITION_NONE:
+        set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+        break;
        case PARTITION_HORZ:
-        mi->sb_type = subsize;
-        if (mi_row + bs < cm->mi_rows)
-          cm->mi_grid_visible[(mi_row + bs) * cm->mi_stride + mi_col]->sb_type =
-              subsize;
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row + bs, mi_col, subsize);
          break;
        case PARTITION_VERT:
-        mi->sb_type = subsize;
-        if (mi_col + bs < cm->mi_cols)
-          cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col + bs]->sb_type =
-              subsize;
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize);
          break;
        case PARTITION_SPLIT:
-        copy_partitioning_helper(cpi, subsize, mi_row, mi_col);
-        copy_partitioning_helper(cpi, subsize, mi_row + bs, mi_col);
-        copy_partitioning_helper(cpi, subsize, mi_row, mi_col + bs);
-        copy_partitioning_helper(cpi, subsize, mi_row + bs, mi_col + bs);
+        copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col);
+        copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col);
+        copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs);
+        copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs);
          break;
        default: assert(0);
      }
    }
  }
  
-static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
-                             int mi_col, int segment_id, int sb_offset) {
-  if (cpi->rc.frames_since_key > 1 && segment_id == CR_SEGMENT_ID_BASE &&
+static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                             int mi_row, int mi_col, int segment_id,
+                             int sb_offset) {
+  int svc_copy_allowed = 1;
+  int frames_since_key_thresh = 1;
+  if (cpi->use_svc) {
+    // For SVC, don't allow copy if base spatial layer is key frame, or if
+    // frame is not in the highest temporal layer (with multiple layers).
+    int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id,
+                                 cpi->svc.number_temporal_layers);
+    const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+    if (lc->is_key_frame ||
+        (cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1 &&
+         cpi->svc.number_temporal_layers > 1))
+      svc_copy_allowed = 0;
+    frames_since_key_thresh = cpi->svc.number_spatial_layers << 1;
+  }
+  if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed &&
+      !cpi->resize_pending && segment_id == CR_SEGMENT_ID_BASE &&
        cpi->prev_segment_id[sb_offset] == CR_SEGMENT_ID_BASE &&
        cpi->copied_frame_cnt[sb_offset] < cpi->max_copied_frame) {
      if (cpi->prev_partition != NULL) {
-      copy_partitioning_helper(cpi, BLOCK_64X64, mi_row, mi_col);
+      copy_partitioning_helper(cpi, x, xd, BLOCK_64X64, mi_row, mi_col);
        cpi->copied_frame_cnt[sb_offset] += 1;
        memcpy(x->variance_low, &(cpi->prev_variance_low[sb_offset * 25]),
               sizeof(x->variance_low));
@@ -1081,7 +1094,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
        x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2];
      // If source_sad is low copy the partition without computing the y_sad.
      if (x->skip_low_source_sad && cpi->sf.copy_partition_flag &&
-        copy_partitioning(cpi, x, mi_row, mi_col, segment_id, sb_offset)) {
+        copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
        return 0;
      }
    }
@@ -1208,7 +1221,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
      // Stop the copy every cpi->max_copied_frame to refresh the partition.
      // TODO(jianj) : tune the threshold.
      if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy &&
-        copy_partitioning(cpi, x, mi_row, mi_col, segment_id, sb_offset)) {
+        copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
        chroma_check(cpi, x, bsize, y_sad, is_key_frame);
        return 0;
      }
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c

index f57f40dbe4cf68ea4be624499e009c60dce9d12a..00db9d57dd9ba2d0db41185395f211a666960367 100644 (file)
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2557,7 +2557,7 @@ int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
                             YV12_BUFFER_CONFIG *sd) {
    YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
    if (cfg) {
-    vp8_yv12_copy_frame(cfg, sd);
+    vpx_yv12_copy_frame(cfg, sd);
      return 0;
    } else {
      return -1;
@@ -2568,7 +2568,7 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
                            YV12_BUFFER_CONFIG *sd) {
    YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
    if (cfg) {
-    vp8_yv12_copy_frame(sd, cfg);
+    vpx_yv12_copy_frame(sd, cfg);
      return 0;
    } else {
      return -1;
@@ -2588,7 +2588,7 @@ int vp9_update_entropy(VP9_COMP *cpi, int update) {
  // denoising we will have to modify this.
  void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
    uint8_t *src = s->y_buffer;
-  int h = s->y_height;
+  int h = s->y_crop_height;
  
    do {
      fwrite(src, s->y_width, 1, f);
@@ -2596,7 +2596,7 @@ void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
    } while (--h);
  
    src = s->u_buffer;
-  h = s->uv_height;
+  h = s->uv_crop_height;
  
    do {
      fwrite(src, s->uv_width, 1, f);
@@ -2604,7 +2604,7 @@ void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
    } while (--h);
  
    src = s->v_buffer;
-  h = s->uv_height;
+  h = s->uv_crop_height;
  
    do {
      fwrite(src, s->uv_width, 1, f);
@@ -3518,6 +3518,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
  
    if ((cpi->use_svc &&
         (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 ||
+        cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 ||
          cpi->svc.current_superframe < 1)) ||
        cpi->resize_pending || cpi->resize_state || cpi->external_resize ||
        cpi->resize_state != ORIG) {
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c

index 77c49c5455a492e103f47232a79a28d70be74825..b6e3275482c9b4f695f4869b0a3913906e6b2af6 100644 (file)
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -238,14 +238,14 @@ static double calculate_active_area(const VP9_COMP *cpi,
  // Calculate a modified Error used in distributing bits between easier and
  // harder frames.
  #define ACT_AREA_CORRECTION 0.5
-static double calculate_modified_err(const VP9_COMP *cpi,
-                                     const TWO_PASS *twopass,
-                                     const VP9EncoderConfig *oxcf,
-                                     const FIRSTPASS_STATS *this_frame) {
+static double calculate_mod_frame_score(const VP9_COMP *cpi,
+                                        const TWO_PASS *twopass,
+                                        const VP9EncoderConfig *oxcf,
+                                        const FIRSTPASS_STATS *this_frame) {
    const FIRSTPASS_STATS *const stats = &twopass->total_stats;
    const double av_weight = stats->weight / stats->count;
    const double av_err = (stats->coded_error * av_weight) / stats->count;
-  double modified_error =
+  double modified_score =
        av_err * pow(this_frame->coded_error * this_frame->weight /
                         DOUBLE_DIVIDE_CHECK(av_err),
                     oxcf->two_pass_vbrbias / 100.0);
@@ -255,11 +255,38 @@ static double calculate_modified_err(const VP9_COMP *cpi,
    // remaining active MBs. The correction here assumes that coding
    // 0.5N blocks of complexity 2X is a little easier than coding N
    // blocks of complexity X.
-  modified_error *=
+  modified_score *=
        pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
  
-  return fclamp(modified_error, twopass->modified_error_min,
-                twopass->modified_error_max);
+  return modified_score;
+}
+static double calculate_norm_frame_score(const VP9_COMP *cpi,
+                                         const TWO_PASS *twopass,
+                                         const VP9EncoderConfig *oxcf,
+                                         const FIRSTPASS_STATS *this_frame) {
+  const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+  const double av_weight = stats->weight / stats->count;
+  const double av_err = (stats->coded_error * av_weight) / stats->count;
+  double modified_score =
+      av_err * pow(this_frame->coded_error * this_frame->weight /
+                       DOUBLE_DIVIDE_CHECK(av_err),
+                   oxcf->two_pass_vbrbias / 100.0);
+
+  const double min_score = (double)(oxcf->two_pass_vbrmin_section) / 100.0;
+  const double max_score = (double)(oxcf->two_pass_vbrmax_section) / 100.0;
+
+  // Correction for active area. Frames with a reduced active area
+  // (eg due to formatting bars) have a higher error per mb for the
+  // remaining active MBs. The correction here assumes that coding
+  // 0.5N blocks of complexity 2X is a little easier than coding N
+  // blocks of complexity X.
+  modified_score *=
+      pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+  // Normalize to a midpoint score.
+  modified_score /= DOUBLE_DIVIDE_CHECK(twopass->mean_mod_score);
+
+  return fclamp(modified_score, min_score, max_score);
  }
  
  // This function returns the maximum target rate per frame.
@@ -707,9 +734,14 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
  
    fps->frame = cm->current_video_frame;
    fps->spatial_layer_id = cpi->svc.spatial_layer_id;
-  fps->coded_error = (double)(fp_acc_data->coded_error >> 8) + min_err;
-  fps->sr_coded_error = (double)(fp_acc_data->sr_coded_error >> 8) + min_err;
-  fps->intra_error = (double)(fp_acc_data->intra_error >> 8) + min_err;
+
+  fps->coded_error =
+      ((double)(fp_acc_data->coded_error >> 8) + min_err) / num_mbs;
+  fps->sr_coded_error =
+      ((double)(fp_acc_data->sr_coded_error >> 8) + min_err) / num_mbs;
+  fps->intra_error =
+      ((double)(fp_acc_data->intra_error >> 8) + min_err) / num_mbs;
+
    fps->frame_noise_energy =
        (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs;
    fps->count = 1.0;
@@ -1565,8 +1597,9 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
      const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
                              ? cpi->initial_mbs
                              : cpi->common.MBs;
-    const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
-    const double av_err_per_mb = section_err / active_mbs;
+    const double active_pct = VPXMAX(0.01, 1.0 - inactive_zone);
+    const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct);
+    const double av_err_per_mb = section_err / active_pct;
      const double speed_term = 1.0 + 0.04 * oxcf->speed;
      double last_group_rate_err;
      const int target_norm_bits_per_mb =
@@ -1675,22 +1708,35 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
    // This variable monitors how far behind the second ref update is lagging.
    twopass->sr_update_lag = 1;
  
-  // Scan the first pass file and calculate a modified total error based upon
-  // the bias/power function used to allocate bits.
+  // Scan the first pass file and calculate a modified score for each
+  // frame that is used to distribute bits. The modified score is assumed
+  // to provide a linear basis for bit allocation. I.e. a frame A with a score
+  // that is double that of frame B will be allocated 2x as many bits.
    {
-    const double avg_error =
-        stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
      const FIRSTPASS_STATS *s = twopass->stats_in;
-    double modified_error_total = 0.0;
-    twopass->modified_error_min =
-        (avg_error * oxcf->two_pass_vbrmin_section) / 100;
-    twopass->modified_error_max =
-        (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+    double modified_score_total = 0.0;
+
+    // The first scan is unclamped and gives a raw average.
      while (s < twopass->stats_in_end) {
-      modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
+      modified_score_total += calculate_mod_frame_score(cpi, twopass, oxcf, s);
        ++s;
      }
-    twopass->modified_error_left = modified_error_total;
+
+    // The average error from this first scan is used to define the midpoint
+    // error for the rate distribution function.
+    twopass->mean_mod_score =
+        modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count);
+
+    // Second scan using clamps based on the previous cycle average.
+    // This may modify the total and average somewhat but we don't bother with
+    // further iterations.
+    s = twopass->stats_in;
+    modified_score_total = 0.0;
+    while (s < twopass->stats_in_end) {
+      modified_score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s);
+      ++s;
+    }
+    twopass->normalized_score_left = modified_score_total;
    }
  
    // Reset the vbr bits off target counters
@@ -1727,9 +1773,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
  
  static double get_sr_decay_rate(const VP9_COMP *cpi,
                                  const FIRSTPASS_STATS *frame) {
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : cpi->common.MBs;
-  double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
+  double sr_diff = (frame->sr_coded_error - frame->coded_error);
    double sr_decay = 1.0;
    double modified_pct_inter;
    double modified_pcnt_intra;
@@ -1738,7 +1782,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi,
                              (cpi->initial_height + cpi->initial_width));
  
    modified_pct_inter = frame->pcnt_inter;
-  if (((frame->coded_error / num_mbs) > LOW_CODED_ERR_PER_MB) &&
+  if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
        ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
         (double)NCOUNT_FRAME_II_THRESH)) {
      modified_pct_inter =
@@ -1860,20 +1904,16 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
    const double lq = vp9_convert_qindex_to_q(
        cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
    const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
-  int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                       : cpi->common.MBs;
-
-  // Correct for any inactive region in the image
-  num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+  const double active_area = calculate_active_area(cpi, this_frame);
  
    // Underlying boost factor is based on inter error ratio.
-  frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+  frame_boost = (BASELINE_ERR_PER_MB * active_area) /
                  DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
  
    // Update the accumulator for second ref error difference.
    // This is intended to give an indication of how much the coded error is
    // increasing over time.
-  *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+  *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
    *sr_accumulator = VPXMAX(0.0, *sr_accumulator);
  
    // Small adjustment for cases where there is a zoom out
@@ -1896,20 +1936,16 @@ static double calc_kf_frame_boost(VP9_COMP *cpi,
    const double lq = vp9_convert_qindex_to_q(
        cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
    const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00);
-  int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                       : cpi->common.MBs;
-
-  // Correct for any inactive region in the image
-  num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+  const double active_area = calculate_active_area(cpi, this_frame);
  
    // Underlying boost factor is based on inter error ratio.
-  frame_boost = (KF_BASELINE_ERR_PER_MB * num_mbs) /
+  frame_boost = (KF_BASELINE_ERR_PER_MB * active_area) /
                  DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
  
    // Update the accumulator for second ref error difference.
    // This is intended to give an indication of how much the coded error is
    // increasing over time.
-  *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+  *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
    *sr_accumulator = VPXMAX(0.0, *sr_accumulator);
  
    // Small adjustment for cases where there is a zoom out
@@ -2042,7 +2078,7 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
    int64_t total_group_bits;
  
    // Calculate the bits to be allocated to the group as a whole.
-  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) {
      total_group_bits = (int64_t)(twopass->kf_group_bits *
                                   (gf_group_err / twopass->kf_group_error_left));
    } else {
@@ -2336,7 +2372,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
    vp9_zero(next_frame);
  
    // Load stats for the current frame.
-  mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+  mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
  
    // Note the error of the frame at the start of the group. This will be
    // the GF frame error if we code a normal gf.
@@ -2402,7 +2438,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
      ++i;
  
      // Accumulate error score of frames in this gf group.
-    mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
      gf_group_err += mod_frame_err;
      gf_group_raw_error += this_frame->coded_error;
      gf_group_noise += this_frame->frame_noise_energy;
@@ -2511,7 +2547,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
      int j;
      for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
        if (EOF == input_stats(twopass, this_frame)) break;
-      gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+      gf_group_err +=
+          calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
        gf_group_raw_error += this_frame->coded_error;
        gf_group_noise += this_frame->frame_noise_energy;
        gf_group_skip_pct += this_frame->intra_skip_pct;
@@ -2566,7 +2603,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
                                       gf_group_bits);
  
    // Adjust KF group bits and error remaining.
-  twopass->kf_group_error_left -= (int64_t)gf_group_err;
+  twopass->kf_group_error_left -= gf_group_err;
  
    // Allocate bits to each of the frames in the GF group.
    allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits);
@@ -2616,6 +2653,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  #define II_IMPROVEMENT_THRESHOLD 3.5
  #define KF_II_MAX 128.0
  #define II_FACTOR 12.5
+// Test for very low intra complexity which could cause false key frames
+#define V_LOW_INTRA 0.5
+
  static int test_candidate_kf(TWO_PASS *twopass,
                               const FIRSTPASS_STATS *last_frame,
                               const FIRSTPASS_STATS *this_frame,
@@ -2674,7 +2714,7 @@ static int test_candidate_kf(TWO_PASS *twopass,
              0.20) &&
             (next_iiratio < 3.0)) ||
            ((boost_score - old_boost_score) < 3.0) ||
-          (local_next_frame.intra_error < 200)) {
+          (local_next_frame.intra_error < V_LOW_INTRA)) {
          break;
        }
  
@@ -2750,10 +2790,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  
    rc->frames_to_key = 1;
  
-  twopass->kf_group_bits = 0;        // Total bits available to kf group
-  twopass->kf_group_error_left = 0;  // Group modified error score.
+  twopass->kf_group_bits = 0;          // Total bits available to kf group
+  twopass->kf_group_error_left = 0.0;  // Group modified error score.
  
-  kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+  kf_mod_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
  
    // Initialize the decay rates for the recent frames to check
    for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
@@ -2763,7 +2803,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
    while (twopass->stats_in < twopass->stats_in_end &&
           rc->frames_to_key < cpi->oxcf.key_freq) {
      // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
  
      // Load the next frame's stats.
      last_frame = *this_frame;
@@ -2823,7 +2863,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
  
      // Rescan to get the correct error data for the forced kf group.
      for (i = 0; i < rc->frames_to_key; ++i) {
-      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+      kf_group_err +=
+          calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame);
        input_stats(twopass, &tmp_frame);
      }
      rc->next_key_frame_forced = 1;
@@ -2840,7 +2881,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
      int j;
      for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
        if (EOF == input_stats(twopass, this_frame)) break;
-      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+      kf_group_err +=
+          calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
      }
      rc->frames_to_key = new_frame_to_key;
    }
@@ -2848,11 +2890,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
    // Special case for the last key frame of the file.
    if (twopass->stats_in >= twopass->stats_in_end) {
      // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
    }
  
    // Calculate the number of bits that should be assigned to the kf group.
-  if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+  if (twopass->bits_left > 0 && twopass->normalized_score_left > 0.0) {
      // Maximum number of bits for a single normal frame (not key frame).
      const int max_bits = frame_max_bits(rc, &cpi->oxcf);
  
@@ -2862,7 +2904,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
      // Default allocation based on bits left and relative
      // complexity of the section.
      twopass->kf_group_bits = (int64_t)(
-        twopass->bits_left * (kf_group_err / twopass->modified_error_left));
+        twopass->bits_left * (kf_group_err / twopass->normalized_score_left));
  
      // Clip based on maximum per frame rate defined by the user.
      max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
@@ -2935,12 +2977,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
    gf_group->rf_level[0] = KF_STD;
  
    // Note the total error score of the kf group minus the key frame itself.
-  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+  twopass->kf_group_error_left = (kf_group_err - kf_mod_err);
  
    // Adjust the count of total modified error left.
    // The count of bits left is adjusted elsewhere based on real coded frame
    // sizes.
-  twopass->modified_error_left -= kf_group_err;
+  twopass->normalized_score_left -= kf_group_err;
  
    if (oxcf->resize_mode == RESIZE_DYNAMIC) {
      // Default to normal-sized frame on keyframes.
@@ -3172,16 +3214,10 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
    target_rate = gf_group->bit_allocation[gf_group->index];
    rc->base_frame_target = target_rate;
  
-  {
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cpi->common.MBs;
-    // The multiplication by 256 reverses a scaling factor of (>> 8)
-    // applied when combining MB error values for the frame.
-    twopass->mb_av_energy =
-        log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
-    twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
-  }
+  // The multiplication by 256 reverses a scaling factor of (>> 8)
+  // applied when combining MB error values for the frame.
+  twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0);
+  twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
  
    // Update the total stats remaining structure.
    subtract_stats(&twopass->total_left_stats, &this_frame);
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h

index d660aa1ffb883c832a2683e85154e42c10b9ae33..000ecd77926b525913e367442616815fdf11e8f2 100644 (file)
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -138,9 +138,8 @@ typedef struct {
    FIRSTPASS_STATS total_left_stats;
    int first_pass_done;
    int64_t bits_left;
-  double modified_error_min;
-  double modified_error_max;
-  double modified_error_left;
+  double mean_mod_score;
+  double normalized_score_left;
    double mb_av_energy;
    double mb_smooth_pct;
  
@@ -159,7 +158,7 @@ typedef struct {
    int64_t kf_group_bits;
  
    // Error score of frames still to be coded in kf group
-  int64_t kf_group_error_left;
+  double kf_group_error_left;
  
    double bpm_factor;
    int rolling_arf_group_target_bits;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c

index 24e23af3b15b6bf03ebfd172ac80b1e0d1761ba1..f0e0370db39c2579d04bd3cd05f6ef89522a48ec 100644 (file)
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -361,7 +361,7 @@ static unsigned int setup_center_error(
  #endif  // CONFIG_VP9_HIGHBITDEPTH
  }
  
-static INLINE int divide_and_round(const int n, const int d) {
+static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) {
    return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
  }
  
@@ -379,10 +379,13 @@ static INLINE int is_cost_list_wellbehaved(int *cost_list) {
  // y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
  // The code below is an integerized version of that.
  static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
-  *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
-                         (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
-  *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
-                         (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+  const int64_t x0 = (int64_t)cost_list[1] - cost_list[3];
+  const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3];
+  const int64_t x1 = (int64_t)cost_list[4] - cost_list[2];
+  const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2];
+  const int b = 1 << (bits - 1);
+  *ic = (int)divide_and_round(x0 * b, y0);
+  *ir = (int)divide_and_round(x1 * b, y1);
  }
  
  uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv,
@@ -441,7 +444,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore(
        cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
        cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
      int ir, ic;
-    unsigned int minpt;
+    unsigned int minpt = INT_MAX;
      get_cost_surf_min(cost_list, &ir, &ic, 2);
      if (ir != 0 || ic != 0) {
        CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c

index b05f4184bd043009c75c3825236b9c9f65e22cb8..fc0629f2b67b0171855f2300cdf76ce4fa551b6b 100644 (file)
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -158,6 +158,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
    const MvLimits tmp_mv_limits = x->mv_limits;
    int rv = 0;
    int cost_list[5];
+  int search_subpel = 1;
    const YV12_BUFFER_CONFIG *scaled_ref_frame =
        vp9_get_scaled_ref_frame(cpi, ref);
    if (scaled_ref_frame) {
@@ -210,7 +211,12 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
    rv =
        !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar);
  
-  if (rv) {
+  // For SVC on non-reference frame, avoid subpel for (0, 0) motion.
+  if (cpi->use_svc && cpi->svc.non_reference_frame) {
+    if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0;
+  }
+
+  if (rv && search_subpel) {
      const int subpel_force_stop = cpi->sf.mv.subpel_force_stop;
      cpi->find_fractional_mv_step(
          x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv,
@@ -1752,7 +1758,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
            cpi->oxcf.rc_mode == VPX_CBR) {
          int tmp_sad;
          uint32_t dis;
-        int cost_list[5];
+        int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX };
  
          if (bsize < BLOCK_16X16) continue;
  
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c

index b1866cd8ab6811c2cd751a11ffc2538c228cd080..27fea5d4e7881ba0be718218d0e6a4c21b55a664 100644 (file)
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -547,6 +547,7 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
  int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
                        int active_best_quality, int active_worst_quality) {
    const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
    int q = active_worst_quality;
    int last_error = INT_MAX;
    int i, target_bits_per_mb, bits_per_mb_at_this_q;
@@ -561,7 +562,7 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
  
    do {
      if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
-        cpi->svc.temporal_layer_id == 0 &&
+        cr->apply_cyclic_refresh &&
          (!cpi->oxcf.gf_cbr_boost_pct || !cpi->refresh_golden_frame)) {
        bits_per_mb_at_this_q =
            (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c

index 7917c63bf16e2b7e67d2538439b22cd50a1a2b97..14bdf4b02d447652799e0dfb29da037418aeab5c 100644 (file)
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -534,7 +534,6 @@ static void set_rt_speed_feature_framesize_independent(
      if (cpi->svc.temporal_layer_id > 0) {
        sf->adaptive_rd_thresh = 4;
        sf->limit_newmv_early_exit = 0;
-      sf->mv.subpel_force_stop = (cpi->svc.temporal_layer_id == 1) ? 1 : 2;
        sf->base_mv_aggressive =
            (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
                ? 1
@@ -551,6 +550,8 @@ static void set_rt_speed_feature_framesize_independent(
        sf->mv.search_method = NSTEP;
        sf->mv.fullpel_search_step_param = 6;
      }
+    if (cpi->svc.temporal_layer_id > 0)
+      sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
      if (!cpi->external_resize) sf->use_source_sad = 1;
      if (sf->use_source_sad) {
        if (cpi->content_state_sb_fd == NULL &&
@@ -560,25 +561,36 @@ static void set_rt_speed_feature_framesize_independent(
              (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
        }
      }
+    // Enable partition copy. For SVC only enabled for top spatial resolution
+    // layer.
+    cpi->max_copied_frame = 0;
+    if (!cpi->last_frame_dropped && cpi->resize_state == ORIG &&
+        !cpi->external_resize &&
+        (!cpi->use_svc ||
+         cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+      sf->copy_partition_flag = 1;
+      cpi->max_copied_frame = 2;
+      // Top temporal enhancement layer frames (when temporal layers > 1) are
+      // non-reference frames, so use a large/max value for max_copied_frame.
+      if (cpi->svc.number_temporal_layers > 1 &&
+          cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
+        cpi->max_copied_frame = 255;
+    }
    }
  
    if (speed >= 8) {
      sf->adaptive_rd_thresh = 4;
-    // Enable partition copy
-    if (!cpi->last_frame_dropped && !cpi->use_svc && !cpi->resize_pending &&
-        cpi->resize_state == ORIG && !cpi->external_resize &&
-        cpi->oxcf.resize_mode == RESIZE_NONE) {
-      sf->copy_partition_flag = 1;
-      cpi->max_copied_frame = 4;
-    }
-
+    if (!cpi->use_svc) cpi->max_copied_frame = 4;
      if (cpi->row_mt && cpi->oxcf.max_threads > 1)
        sf->adaptive_rd_thresh_row_mt = 1;
  
      if (content == VP9E_CONTENT_SCREEN)
        sf->mv.subpel_force_stop = 3;
-    else if (cm->width > 352 && cm->height > 288)
+    else if (cm->width * cm->height > 352 * 288) {
        sf->mv.subpel_force_stop = 2;
+      if (cpi->rc.avg_frame_low_motion > 87 && cm->current_video_frame > 30)
+        sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
+    }
  
      if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
      // Only keep INTRA_DC mode for speed 8.
@@ -601,7 +613,7 @@ static void set_rt_speed_feature_framesize_independent(
        }
        // Since the short_circuit_low_temp_var is used, reduce the
        // adaptive_rd_thresh level.
-      if (cm->width > 352 && cm->height > 288)
+      if (cm->width * cm->height > 352 * 288)
          sf->adaptive_rd_thresh = 1;
        else
          sf->adaptive_rd_thresh = 2;
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c

index 5867a6c38b8be2150a62eb851091bea88648d11c..71aa82065d3047fe3819c65a78495e49fbfc5ccf 100644 (file)
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -36,6 +36,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
    svc->scaled_temp_is_alloc = 0;
    svc->scaled_one_half = 0;
    svc->current_superframe = 0;
+  svc->non_reference_frame = 0;
    for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1;
    for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
      svc->ext_frame_flags[sl] = 0;
@@ -677,6 +678,12 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
      }
    }
  
+  cpi->svc.non_reference_frame = 0;
+  if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame &&
+      !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) {
+    cpi->svc.non_reference_frame = 1;
+  }
+
    if (vp9_set_size_literal(cpi, width, height) != 0)
      return VPX_CODEC_INVALID_PARAM;
  
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h

index d8e6772b26f1ca3d170af67ac0745ea954feeb27..4e8aaf73fb84973e151cc4e198e414bf9798cff2 100644 (file)
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -87,6 +87,7 @@ typedef struct {
    int ref_frame_index[REF_FRAMES];
    int force_zero_mode_spatial_ref;
    int current_superframe;
+  int non_reference_frame;
    int use_base_mv;
    // Used to control the downscaling filter for source scaling, for 1 pass CBR.
    // downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c

index cca9a9324230a81c702cb60f6199f34be7250991..257e8ffee577ea5ee82090982218b731b3681ce9 100644 (file)
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -16,6 +16,7 @@
  
  #include "vpx/vpx_integer.h"
  #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
  
  static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
    const uint32x4_t a = vpaddlq_u16(v_16x8);
diff --git a/vpx_dsp/arm/avg_pred_neon.c b/vpx_dsp/arm/avg_pred_neon.c

new file mode 100644 (file)

index 0000000..1370ec2
--- /dev/null
+++ b/vpx_dsp/arm/avg_pred_neon.c
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  // Writes the rounding average of pred and ref to comp. comp and pred are
+  // contiguous (they advance by width per row); only ref uses ref_stride.
+  if (width <= 8) {
+    // Narrow blocks: 16 contiguous pred bytes span 4 (w=4) or 2 (w=8) rows.
+    int remaining;
+    for (remaining = width * height; remaining > 0; remaining -= 16) {
+      const uint8x16_t pred_v = vld1q_u8(pred);
+      uint8x16_t ref_v;
+      if (width == 4) {
+        ref_v = load_unaligned_u8q(ref, ref_stride);
+        ref += 4 * ref_stride;
+      } else {
+        const uint8x8_t row_0 = vld1_u8(ref);
+        const uint8x8_t row_1 = vld1_u8(ref + ref_stride);
+        assert(width == 8);
+        ref_v = vcombine_u8(row_0, row_1);
+        ref += 2 * ref_stride;
+      }
+      vst1q_u8(comp, vrhaddq_u8(ref_v, pred_v));
+      pred += 16;
+      comp += 16;
+    }
+    return;
+  }
+  // Wide blocks: walk each row in 16-byte steps.
+  for (; height > 0; --height) {
+    int col;
+    for (col = 0; col < width; col += 16) {
+      const uint8x16_t pred_v = vld1q_u8(pred + col);
+      const uint8x16_t ref_v = vld1q_u8(ref + col);
+      vst1q_u8(comp + col, vrhaddq_u8(pred_v, ref_v));
+    }
+    comp += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct_neon.c

new file mode 100644 (file)

index 0000000..fe78f3f
--- /dev/null
+++ b/vpx_dsp/arm/fdct_neon.c
@@ -0,0 +1,92 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+                      int stride) {
+  int i;
+  // 4x4 forward DCT in two passes; rows input[M * stride] are scaled by 16.
+  int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+  // If input[0] != 0, add an s64 1 viewed as 4 x s16 to row 0.
+  if (input[0] != 0) {
+    const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+    input_0 = vadd_s16(input_0, one);
+  }
+
+  for (i = 0; i < 2; ++i) {
+    const int16x8_t input_01 = vcombine_s16(input_0, input_1);
+    const int16x8_t input_32 = vcombine_s16(input_3, input_2);
+
+    // s_01 = (in_0 + in_3, in_1 + in_2), s_32 = (in_0 - in_3, in_1 - in_2)
+    const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+    const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+    // Split into halves; note s_3 is the low half of s_32 and s_2 the high.
+    const int16x4_t s_0 = vget_low_s16(s_01);
+    const int16x4_t s_1 = vget_high_s16(s_01);
+    const int16x4_t s_2 = vget_high_s16(s_32);
+    const int16x4_t s_3 = vget_low_s16(s_32);
+
+    // out_0, out_2 = (s_0 +/- s_1) * cospi_16_64, computed in 32 bits.
+    // All elements are widened to s32 first; see 'needs32' in fwd_txfm.c.
+    const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+    const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int16_t)cospi_16_64);
+    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int16_t)cospi_16_64);
+
+    // fdct_round_shift: rounding shift right by DCT_CONST_BITS, narrow to s16.
+    int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+    int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+    // temp3 = s_3 * cospi_8_64 + s_2 * cospi_24_64   (-> out_1)
+    // temp4 = s_3 * cospi_24_64 - s_2 * cospi_8_64   (-> out_3)
+    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int16_t)cospi_8_64);
+    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int16_t)cospi_24_64);
+
+    const int32x4_t temp3 =
+        vmlal_n_s16(s_3_cospi_8_64, s_2, (int16_t)cospi_24_64);
+    const int32x4_t temp4 =
+        vmlsl_n_s16(s_3_cospi_24_64, s_2, (int16_t)cospi_8_64);
+
+    // fdct_round_shift: rounding shift right by DCT_CONST_BITS, narrow to s16.
+    int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+    int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+    transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+    input_0 = out_0;
+    input_1 = out_1;
+    input_2 = out_2;
+    input_3 = out_3;
+  }
+
+  {
+    // Not quite a rounding shift: only 1 (not 2) is added before the >> 2.
+    const int16x8_t one = vdupq_n_s16(1);
+    int16x8_t out_01 = vcombine_s16(input_0, input_1);
+    int16x8_t out_23 = vcombine_s16(input_2, input_3);
+    out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+    out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+    store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+    store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+  }
+}
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c

index b26920504f58ac86f845c7300e16f56337450ebf..c449b4660163e27bd569c567fdb42ae1513234e1 100644 (file)
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -14,6 +14,7 @@
  #include "vpx_dsp/txfm_common.h"
  #include "vpx_dsp/vpx_dsp_common.h"
  #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
  
  void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
                        int stride) {
diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c

index ebeafed31fdbfb2ad758eea2b0b2041d28702753..79bedd848a31e5b100482281332b1df6dcbf1943 100644 (file)
--- a/vpx_dsp/arm/hadamard_neon.c
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -13,6 +13,7 @@
  #include "./vpx_dsp_rtcd.h"
  #include "vpx/vpx_integer.h"
  #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_dsp/arm/transpose_neon.h"
  
  static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
diff --git a/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_neon.c

index f769620a43bfaf470e399c0b7b46ce4391fa6f29..1f2631b7989b6c65b3ce3caee70d746a7ab827c3 100644 (file)
--- a/vpx_dsp/arm/highbd_vpx_convolve_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_neon.c
@@ -20,7 +20,7 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
                                 int h, int bd) {
    const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
    // + 1 to make it divisible by 4
-  DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
+  uint16_t temp[64 * 136];
    const int intermediate_height =
        (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  
@@ -44,7 +44,7 @@ void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
                                     int w, int h, int bd) {
    const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
    // + 1 to make it divisible by 4
-  DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
+  uint16_t temp[64 * 136];
    const int intermediate_height =
        (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  
diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c

index 828fb5f6c71f89446233d4239b4352e719ebe22c..5c5963d277efa5fc73ccbf5f6179041d447f8762 100644 (file)
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -12,6 +12,7 @@
  
  #include "./vpx_dsp_rtcd.h"
  #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_dsp/txfm_common.h"
  
  static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
diff --git a/vpx_dsp/arm/idct32x32_135_add_neon.c b/vpx_dsp/arm/idct32x32_135_add_neon.c

index b398259918a0a3519f24d235cef0bd35e2cb13d5..021211bc99072263b8566ec7197bb0bb7b44c9f6 100644 (file)
--- a/vpx_dsp/arm/idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -13,6 +13,7 @@
  #include "./vpx_config.h"
  #include "./vpx_dsp_rtcd.h"
  #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_dsp/arm/transpose_neon.h"
  #include "vpx_dsp/txfm_common.h"
  
diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c

index fc0c4cd84628b03ba2166b32d589cee8c45fd947..f3c336fa31f13ef2ccd2f1addb6bd734c7705be7 100644 (file)
--- a/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -13,6 +13,7 @@
  #include "./vpx_config.h"
  #include "./vpx_dsp_rtcd.h"
  #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_dsp/arm/transpose_neon.h"
  #include "vpx_dsp/txfm_common.h"
  
diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c

index 91418c9e69139b8d1e5e1cba2bef1f52b90d6370..9f4589ea968c5608cb36cc210de90e405f99790a 100644 (file)
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -13,6 +13,7 @@
  #include "./vpx_config.h"
  #include "./vpx_dsp_rtcd.h"
  #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_dsp/arm/transpose_neon.h"
  #include "vpx_dsp/txfm_common.h"
  
diff --git a/vpx_dsp/arm/idct4x4_1_add_neon.c b/vpx_dsp/arm/idct4x4_1_add_neon.c

index d1eae24a2228e2867523d1f9a5c6e1fc8c84df50..21d21b033681ed9e5b15e07c80b36995b6a6c3f9 100644 (file)
--- a/vpx_dsp/arm/idct4x4_1_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -12,6 +12,7 @@
  #include <assert.h>
  
  #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_dsp/inv_txfm.h"
  
  static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c

index bff98cbc1690494948e93f4a378c255bf518b5d0..673a36840e336decedb1bdbc0c4d8316cb0f8a20 100644 (file)
--- a/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_add_neon.c
@@ -13,13 +13,14 @@
  
  #include "./vpx_dsp_rtcd.h"
  #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_dsp/txfm_common.h"
  
  void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride) {
    const uint8_t *dst = dest;
    const int16x4_t cospis = vld1_s16(kCospi);
-  uint32x2_t dest01_u32 = vdup_n_u32(0);
+  uint8x8_t dest01_u8;
    uint32x2_t dest32_u32 = vdup_n_u32(0);
    int16x8_t a0, a1;
    uint8x8_t d01, d32;
@@ -39,25 +40,22 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
    a0 = vrshrq_n_s16(a0, 4);
    a1 = vrshrq_n_s16(a1, 4);
  
-  dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0);
-  dst += stride;
-  dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1);
-  dst += stride;
+  dest01_u8 = load_u8(dst, stride);
+  dst += 2 * stride;
+  // The elements are loaded in reverse order.
    dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1);
    dst += stride;
    dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0);
  
-  d01_u16 =
-      vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32));
+  d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8);
    d32_u16 =
        vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32));
    d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16));
    d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16));
  
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0);
-  dest += stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1);
-  dest += stride;
+  store_u8(dest, stride, d01);
+  dest += 2 * stride;
+  // The elements are stored in reverse order.
    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1);
    dest += stride;
    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0);
diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c

index 279da67d74f6cf1b6ec80d317c08b67f1efbaa1d..1121ade27961486ceb57a7d7c46664e57e5ec106 100644 (file)
--- a/vpx_dsp/arm/idct8x8_add_neon.c
+++ b/vpx_dsp/arm/idct8x8_add_neon.c
@@ -13,6 +13,7 @@
  #include "./vpx_config.h"
  #include "./vpx_dsp_rtcd.h"
  #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_dsp/arm/transpose_neon.h"
  #include "vpx_dsp/txfm_common.h"
  
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h

index 27c784edca925950bd46db670073fc566b7e6f37..6ed02af5acc1b94173048378413585ed95b0f670 100644 (file)
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -18,7 +18,7 @@
  #include "vpx_dsp/txfm_common.h"
  #include "vpx_dsp/vpx_dsp_common.h"
  
-DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
+static const int16_t kCospi[16] = {
    16384 /*  cospi_0_64  */, 15137 /*  cospi_8_64  */,
    11585 /*  cospi_16_64 */, 6270 /*  cospi_24_64 */,
    16069 /*  cospi_4_64  */, 13623 /*  cospi_12_64 */,
@@ -29,7 +29,7 @@ DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
    12665 /*  cospi_14_64 */, -10394 /* -cospi_18_64 */
  };
  
-DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = {
+static const int32_t kCospi32[16] = {
    16384 /*  cospi_0_64  */, 15137 /*  cospi_8_64  */,
    11585 /*  cospi_16_64 */, 6270 /*  cospi_24_64 */,
    16069 /*  cospi_4_64  */, 13623 /*  cospi_12_64 */,
@@ -40,58 +40,6 @@ DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = {
    12665 /*  cospi_14_64 */, -10394 /* -cospi_18_64 */
  };
  
-//------------------------------------------------------------------------------
-// Helper functions used to load tran_low_t into int16, narrowing if necessary.
-
-static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4x2_t v0 = vld2q_s32(buf);
-  const int32x4x2_t v1 = vld2q_s32(buf + 8);
-  const int16x4_t s0 = vmovn_s32(v0.val[0]);
-  const int16x4_t s1 = vmovn_s32(v0.val[1]);
-  const int16x4_t s2 = vmovn_s32(v1.val[0]);
-  const int16x4_t s3 = vmovn_s32(v1.val[1]);
-  int16x8x2_t res;
-  res.val[0] = vcombine_s16(s0, s2);
-  res.val[1] = vcombine_s16(s1, s3);
-  return res;
-#else
-  return vld2q_s16(buf);
-#endif
-}
-
-static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4_t v0 = vld1q_s32(buf);
-  const int32x4_t v1 = vld1q_s32(buf + 4);
-  const int16x4_t s0 = vmovn_s32(v0);
-  const int16x4_t s1 = vmovn_s32(v1);
-  return vcombine_s16(s0, s1);
-#else
-  return vld1q_s16(buf);
-#endif
-}
-
-static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4_t v0 = vld1q_s32(buf);
-  return vmovn_s32(v0);
-#else
-  return vld1_s16(buf);
-#endif
-}
-
-static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
-  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
-  vst1q_s32(buf, v0);
-  vst1q_s32(buf + 4, v1);
-#else
-  vst1q_s16(buf, a);
-#endif
-}
-
  //------------------------------------------------------------------------------
  // Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth
  static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h

new file mode 100644 (file)

index 0000000..4efad53
--- /dev/null
+++ b/vpx_dsp/arm/mem_neon.h
@@ -0,0 +1,169 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_MEM_NEON_H_
+#define VPX_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Helper functions used to load tran_low_t into int16, narrowing if necessary.
+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
+  // Returns 16 coefficients de-interleaved: val[0] = even, val[1] = odd.
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Coefficients are loaded as s32 here: de-interleave 16 of them with two
+  // vld2q loads, then narrow each lane to 16 bits.
+  const int32x4x2_t lo = vld2q_s32(buf);
+  const int32x4x2_t hi = vld2q_s32(buf + 8);
+  int16x8x2_t out;
+  out.val[0] = vcombine_s16(vmovn_s32(lo.val[0]), vmovn_s32(hi.val[0]));
+  out.val[1] = vcombine_s16(vmovn_s32(lo.val[1]), vmovn_s32(hi.val[1]));
+  return out;
+#else
+  // Coefficients are already 16 bits: one de-interleaving load is enough.
+  return vld2q_s16(buf);
+#endif
+}
+
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Load 8 s32 coefficients as two quads and narrow each lane to 16 bits.
+  const int16x4_t low_half = vmovn_s32(vld1q_s32(buf));
+  const int16x4_t high_half = vmovn_s32(vld1q_s32(buf + 4));
+  return vcombine_s16(low_half, high_half);
+#else
+  // Coefficients are already 16 bits wide: load 8 of them directly.
+  return vld1q_s16(buf);
+#endif
+}
+
+static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Four s32 coefficients, each narrowed to its low 16 bits.
+  return vmovn_s32(vld1q_s32(buf));
+#else
+  return vld1_s16(buf);
+#endif
+}
+
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Sign-extend each half of a to s32 and store the 8 results in order.
+  vst1q_s32(buf, vmovl_s16(vget_low_s16(a)));
+  vst1q_s32(buf + 4, vmovl_s16(vget_high_s16(a)));
+#else
+  // 16-bit coefficients: store the vector unchanged.
+  vst1q_s16(buf, a);
+#endif
+}
+
+// Writes the 4 bytes of |a| to buf via memcpy. A store through a uint32_t
+// pointer would let the compiler assume 4-byte alignment and attach an
+// alignment hint to the memory access.
+//
+// Intended for uint8_t kernels that store 4 values at a time to addresses
+// which may not sit on a 4 byte boundary.
+static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
+  memcpy(buf, &a, sizeof(a));
+}
+
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+  uint32_t row;
+  uint32x2_t rows = vdup_n_u32(0);
+  // Stride 4 means the two rows are adjacent: a single 8 byte load will do.
+  if (stride == 4) return vld1_u8(buf);
+  memcpy(&row, buf, 4);
+  rows = vld1_lane_u32(&row, rows, 0);
+  memcpy(&row, buf + stride, 4);
+  rows = vld1_lane_u32(&row, rows, 1);
+  return vreinterpret_u8_u32(rows);
+}
+
+// Store 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8(uint8_t *buf, int stride,
+                                      const uint8x8_t a) {
+  if (stride == 4) {
+    // Both rows are adjacent in memory: write all 8 bytes at once.
+    vst1_u8(buf, a);
+  } else {
+    const uint32x2_t rows = vreinterpret_u32_u8(a);
+    uint32_to_mem(buf, vget_lane_u32(rows, 0));
+    uint32_to_mem(buf + stride, vget_lane_u32(rows, 1));
+  }
+}
+
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+  uint32_t a;
+  uint32x4_t a_u32 = vdupq_n_u32(0);
+  // With stride 4 the four rows are contiguous: one 16 byte load suffices.
+  if (stride == 4) return vld1q_u8(buf);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 1);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vld1q_lane_u32(&a, a_u32, 2);
+  memcpy(&a, buf, 4);
+  a_u32 = vld1q_lane_u32(&a, a_u32, 3);
+  return vreinterpretq_u8_u32(a_u32);
+}
+
+// Store 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
+                                       const uint8x16_t a) {
+  if (stride == 4) {
+    // The four rows are contiguous: write all 16 bytes with one store.
+    vst1q_u8(buf, a);
+  } else {
+    // Each 32-bit lane holds one 4 byte row; rows are written through
+    // uint32_to_mem so that no alignment is assumed.
+    const uint32x4_t rows = vreinterpretq_u32_u8(a);
+    uint32_to_mem(buf + 0 * stride, vgetq_lane_u32(rows, 0));
+    uint32_to_mem(buf + 1 * stride, vgetq_lane_u32(rows, 1));
+    uint32_to_mem(buf + 2 * stride, vgetq_lane_u32(rows, 2));
+    uint32_to_mem(buf + 3 * stride, vgetq_lane_u32(rows, 3));
+  }
+}
+
+// Load 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
+  const uint8_t *const second_row = buf + stride;
+  uint32x2_t rows = vdup_n_u32(0);
+
+  // Both row addresses must be 4 byte aligned for the uint32_t lane loads.
+  assert(!((intptr_t)buf % sizeof(uint32_t)));
+  assert(!(stride % sizeof(uint32_t)));
+  rows = vld1_lane_u32((const uint32_t *)buf, rows, 0);
+  rows = vld1_lane_u32((const uint32_t *)second_row, rows, 1);
+  return vreinterpret_u8_u32(rows);
+}
+
+// Store 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
+  const uint32x2_t rows = vreinterpret_u32_u8(a);
+
+  // Both row addresses must be 4 byte aligned for the uint32_t lane stores.
+  assert(!((intptr_t)buf % sizeof(uint32_t)));
+  assert(!(stride % sizeof(uint32_t)));
+
+  vst1_lane_u32((uint32_t *)buf, rows, 0);
+  vst1_lane_u32((uint32_t *)(buf + stride), rows, 1);
+}
+#endif  // VPX_DSP_ARM_MEM_NEON_H_
diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c

index 9b1622ff038cc82cdbea1c15e1effab75e808dd6..4f58a7832a568023b43c5f25ee6097bc8b301cb9 100644 (file)
--- a/vpx_dsp/arm/subpel_variance_neon.c
+++ b/vpx_dsp/arm/subpel_variance_neon.c
@@ -12,16 +12,39 @@
  #include "./vpx_dsp_rtcd.h"
  #include "./vpx_config.h"
  
-#include "vpx_ports/mem.h"
  #include "vpx/vpx_integer.h"
  
  #include "vpx_dsp/variance.h"
+#include "vpx_dsp/arm/mem_neon.h"
  
  static const uint8_t bilinear_filters[8][2] = {
    { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
    { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
  };
  
+// Filter a block exactly 4 wide and a multiple of 2 high, two rows per pass.
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+                                      uint8_t *output_ptr,
+                                      unsigned int src_pixels_per_line,
+                                      int pixel_step,
+                                      unsigned int output_height,
+                                      const uint8_t *filter) {
+  const uint8x8_t f0 = vdup_n_u8(filter[0]);
+  const uint8x8_t f1 = vdup_n_u8(filter[1]);
+  unsigned int i;
+  for (i = 0; i < output_height; i += 2) {
+    const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
+    const uint8x8_t src_1 =
+        load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
+    const uint16x8_t a = vmull_u8(src_0, f0);
+    const uint16x8_t b = vmlal_u8(a, src_1, f1);
+    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+    vst1_u8(output_ptr, out);
+    src_ptr += 2 * src_pixels_per_line;
+    output_ptr += 8;
+  }
+}
+
  // Process a block exactly 8 wide and any height.
  static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                        uint8_t *output_ptr,
@@ -29,8 +52,8 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                        int pixel_step,
                                        unsigned int output_height,
                                        const uint8_t *filter) {
-  const uint8x8_t f0 = vmov_n_u8(filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(filter[1]);
+  const uint8x8_t f0 = vdup_n_u8(filter[0]);
+  const uint8x8_t f1 = vdup_n_u8(filter[1]);
    unsigned int i;
    for (i = 0; i < output_height; ++i) {
      const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
@@ -38,8 +61,7 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
      const uint16x8_t a = vmull_u8(src_0, f0);
      const uint16x8_t b = vmlal_u8(a, src_1, f1);
      const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(&output_ptr[0], out);
-    // Next row...
+    vst1_u8(output_ptr, out);
      src_ptr += src_pixels_per_line;
      output_ptr += 8;
    }
@@ -53,8 +75,8 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const uint8_t *filter) {
-  const uint8x8_t f0 = vmov_n_u8(filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(filter[1]);
+  const uint8x8_t f0 = vdup_n_u8(filter[0]);
+  const uint8x8_t f1 = vdup_n_u8(filter[1]);
    unsigned int i, j;
    for (i = 0; i < output_height; ++i) {
      for (j = 0; j < output_width; j += 16) {
@@ -66,36 +88,43 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
        const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
        const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
        const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
-      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+      vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
      }
-    // Next row...
      src_ptr += src_pixels_per_line;
      output_ptr += output_width;
    }
  }
  
-// TODO(johannkoenig): support 4xM block sizes.
-#define sub_pixel_varianceNxM(n, m)                                      \
-  unsigned int vpx_sub_pixel_variance##n##x##m##_neon(                   \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
-      const uint8_t *dst, int dst_stride, unsigned int *sse) {           \
-    DECLARE_ALIGNED(16, uint8_t, fdata3[n * (m + 1)]);                   \
-    DECLARE_ALIGNED(16, uint8_t, temp2[n * m]);                          \
-                                                                         \
-    if (n == 8) {                                                        \
-      var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, (m + 1),     \
-                                bilinear_filters[xoffset]);              \
-      var_filter_block2d_bil_w8(fdata3, temp2, n, n, m,                  \
-                                bilinear_filters[yoffset]);              \
-    } else {                                                             \
-      var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, (m + 1), n, \
-                                 bilinear_filters[xoffset]);             \
-      var_filter_block2d_bil_w16(fdata3, temp2, n, n, m, n,              \
-                                 bilinear_filters[yoffset]);             \
-    }                                                                    \
-    return vpx_variance##n##x##m(temp2, n, dst, dst_stride, sse);        \
+// 4xM filter writes an extra row to temp0 because it processes two rows at a
+// time.
+#define sub_pixel_varianceNxM(n, m)                                 \
+  uint32_t vpx_sub_pixel_variance##n##x##m##_neon(                  \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
+      const uint8_t *b, int b_stride, uint32_t *sse) {              \
+    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                      \
+    uint8_t temp1[n * m];                                           \
+                                                                    \
+    if (n == 4) {                                                   \
+      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),     \
+                                bilinear_filters[xoffset]);         \
+      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,              \
+                                bilinear_filters[yoffset]);         \
+    } else if (n == 8) {                                            \
+      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),     \
+                                bilinear_filters[xoffset]);         \
+      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,              \
+                                bilinear_filters[yoffset]);         \
+    } else {                                                        \
+      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
+                                 bilinear_filters[xoffset]);        \
+      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,          \
+                                 bilinear_filters[yoffset]);        \
+    }                                                               \
+    return vpx_variance##n##x##m(temp1, n, b, b_stride, sse);       \
    }
  
+sub_pixel_varianceNxM(4, 4);
+sub_pixel_varianceNxM(4, 8);
  sub_pixel_varianceNxM(8, 4);
  sub_pixel_varianceNxM(8, 8);
  sub_pixel_varianceNxM(8, 16);
@@ -107,3 +136,49 @@ sub_pixel_varianceNxM(32, 32);
  sub_pixel_varianceNxM(32, 64);
  sub_pixel_varianceNxM(64, 32);
  sub_pixel_varianceNxM(64, 64);
+
+// 4xM filter writes an extra row to temp0 because it processes two rows at a
+// time. temp0 is then reused as the vpx_comp_avg_pred() output buffer.
+#define sub_pixel_avg_varianceNxM(n, m)                             \
+  uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon(              \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
+      const uint8_t *b, int b_stride, uint32_t *sse,                \
+      const uint8_t *second_pred) {                                 \
+    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                      \
+    uint8_t temp1[n * m];                                           \
+                                                                    \
+    if (n == 4) {                                                   \
+      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),     \
+                                bilinear_filters[xoffset]);         \
+      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,              \
+                                bilinear_filters[yoffset]);         \
+    } else if (n == 8) {                                            \
+      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),     \
+                                bilinear_filters[xoffset]);         \
+      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,              \
+                                bilinear_filters[yoffset]);         \
+    } else {                                                        \
+      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
+                                 bilinear_filters[xoffset]);        \
+      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,          \
+                                 bilinear_filters[yoffset]);        \
+    }                                                               \
+                                                                    \
+    vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n);          \
+                                                                    \
+    return vpx_variance##n##x##m(temp0, n, b, b_stride, sse);       \
+  }
+
+sub_pixel_avg_varianceNxM(4, 4);
+sub_pixel_avg_varianceNxM(4, 8);
+sub_pixel_avg_varianceNxM(8, 4);
+sub_pixel_avg_varianceNxM(8, 8);
+sub_pixel_avg_varianceNxM(8, 16);
+sub_pixel_avg_varianceNxM(16, 8);
+sub_pixel_avg_varianceNxM(16, 16);
+sub_pixel_avg_varianceNxM(16, 32);
+sub_pixel_avg_varianceNxM(32, 16);
+sub_pixel_avg_varianceNxM(32, 32);
+sub_pixel_avg_varianceNxM(32, 64);
+sub_pixel_avg_varianceNxM(64, 32);
+sub_pixel_avg_varianceNxM(64, 64);
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c

index c0828e8f63992e285d97cf01f5c2529359b8e263..a6b2c53b7a45880056823b1a2a730e2d6c6b8575 100644 (file)
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -9,11 +9,13 @@
   */
  
  #include <arm_neon.h>
+#include <assert.h>
  
  #include "./vpx_dsp_rtcd.h"
  #include "./vpx_config.h"
  
  #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
  #include "vpx_ports/mem.h"
  
  static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
@@ -31,7 +33,56 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
    return vget_lane_s32(c, 0);
  }
  
-// w * h must be less than 2048 or sum_s16 may overflow.
+// The variance helper functions use int16_t for sum. 8 values are accumulated
+// and then added (at which point they expand up to int32_t). To avoid overflow,
+// there can be no more than 32767 / 255 ~= 128 values accumulated in each
+// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32
+// rows = 128. Asserts have been added to each function to warn against reaching
+// this limit.
+
+// Process a block of width 4 four rows at a time.
+static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int h, uint32_t *sse, int *sum) {
+  int i;
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_lo_s32 = vdupq_n_s32(0);
+  int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+
+  // Width is only 4, so each sum_s16 lane accumulates half a value per row.
+  assert(h <= 256);
+
+  for (i = 0; i < h; i += 4) {
+    const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride);
+    const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride);
+    const uint16x8_t diff_lo_u16 =
+        vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
+    const uint16x8_t diff_hi_u16 =
+        vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
+
+    const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
+    const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
+
+    sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
+    sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+
+    sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
+                           vget_low_s16(diff_lo_s16));
+    sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
+                           vget_high_s16(diff_lo_s16));
+
+    sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
+                           vget_low_s16(diff_hi_s16));
+    sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
+                           vget_high_s16(diff_hi_s16));
+
+    a += 4 * a_stride;
+    b += 4 * b_stride;
+  }
+
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
+}
+
  // Process a block of any size where the width is divisible by 16.
  static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
                                int b_stride, int w, int h, uint32_t *sse,
@@ -41,6 +92,10 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
    int32x4_t sse_lo_s32 = vdupq_n_s32(0);
    int32x4_t sse_hi_s32 = vdupq_n_s32(0);
  
+  // The loop loads 16 values at a time but doubles them up when accumulating
+  // into sum_s16.
+  assert(w / 8 * h <= 128);
+
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 16) {
        const uint8x16_t a_u8 = vld1q_u8(a + j);
@@ -75,7 +130,6 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
    *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
  }
  
-// w * h must be less than 2048 or sum_s16 may overflow.
  // Process a block of width 8 two rows at a time.
  static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b,
                                 int b_stride, int h, uint32_t *sse, int *sum) {
@@ -84,6 +138,9 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b,
    int32x4_t sse_lo_s32 = vdupq_n_s32(0);
    int32x4_t sse_hi_s32 = vdupq_n_s32(0);
  
+  // Each column has its own accumulator entry in sum_s16.
+  assert(h <= 128);
+
    do {
      const uint8x8_t a_0_u8 = vld1_u8(a);
      const uint8x8_t a_1_u8 = vld1_u8(a + a_stride);
@@ -127,7 +184,9 @@ void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                                              const uint8_t *b, int b_stride, \
                                              unsigned int *sse) {            \
      int sum;                                                                \
-    if (n == 8)                                                             \
+    if (n == 4)                                                             \
+      variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum);           \
+    else if (n == 8)                                                        \
        variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum);           \
      else                                                                    \
        variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum);         \
@@ -137,6 +196,8 @@ void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
        return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);              \
    }
  
+varianceNxM(4, 4, 4);
+varianceNxM(4, 8, 5);
  varianceNxM(8, 4, 5);
  varianceNxM(8, 8, 6);
  varianceNxM(8, 16, 7);
diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c

index 6ca0e501b3c3a5d01ab55bb11a3ed1f4742602ab..bdaaff16a9c03defd8b4219c5df6f88155225d10 100644 (file)
--- a/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -21,7 +21,7 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
     * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
     */
-  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+  uint8_t temp[64 * 72];
  
    // Account for the vertical phase needing 3 lines prior and 4 lines post
    const int intermediate_height = h + 7;
@@ -47,7 +47,7 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
-  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+  uint8_t temp[64 * 72];
    const int intermediate_height = h + 7;
  
    assert(y_step_q4 == 16);
diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c

index 3734ac251bd50bbaadab84eea17ad21193e50259..a0db1e40c98e87ecdfd3fb8d6d33485051bca995 100644 (file)
--- a/vpx_dsp/deblock.c
+++ b/vpx_dsp/deblock.c
@@ -9,9 +9,9 @@
   */
  #include <assert.h>
  #include <stdlib.h>
-#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
  
-DECLARE_PROTECTED(const int16_t vpx_rv[]) = {
+const int16_t vpx_rv[] = {
    8,  5,  2,  2,  8,  12, 4,  9,  8,  3,  0,  3,  9,  0,  0,  0,  8,  3,  14,
    4,  10, 1,  11, 14, 1,  14, 9,  6,  12, 11, 8,  6,  10, 0,  0,  8,  9,  0,
    3,  14, 8,  11, 13, 4,  2,  9,  0,  3,  9,  6,  1,  2,  3,  14, 13, 1,  8,
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c

index b1744047af176484cc47b1d1caba0a582ed6c0d0..4214150251ffb5b3874e6969a997810ab097c58d 100644 (file)
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -8,8 +8,6 @@
   *  be found in the AUTHORS file in the root of the source tree.
   */
  
-#include <assert.h>
-
  #include "./vpx_config.h"
  #include "./vpx_dsp_rtcd.h"
  
@@ -226,9 +224,6 @@ MSE(8, 8)
  void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                           int height, const uint8_t *ref, int ref_stride) {
    int i, j;
-  /* comp_pred and pred must be 16 byte aligned. */
-  assert(((intptr_t)comp_pred & 0xf) == 0);
-  assert(((intptr_t)pred & 0xf) == 0);
  
    for (i = 0; i < height; ++i) {
      for (j = 0; j < width; ++j) {
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk

index 8d1ecbe8c746fda5b3bf40ec3a60415e4c89718f..da057c883769f3e08a1f1f23f2c09281a72096b3 100644 (file)
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -193,6 +193,7 @@ DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
  endif
  DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
  DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
+DSP_SRCS-$(HAVE_NEON)   += arm/fdct_neon.c
  DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
  DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
  DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
@@ -324,6 +325,7 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)
  DSP_SRCS-yes            += variance.c
  DSP_SRCS-yes            += variance.h
  
+DSP_SRCS-$(HAVE_NEON)   += arm/avg_pred_neon.c
  DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c
  DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
  
@@ -352,6 +354,7 @@ endif  # CONFIG_VP9_HIGHBITDEPTH
  endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
  
  # Neon utilities
+DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h
  DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
  
  # PPC VSX utilities
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index 3461a80c78700cb5dd5b1a7a2bb8532866318222..9f005b284058092a643f64ee400dbcffc520fbb3 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -484,7 +484,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4 sse2/;
+  specialize qw/vpx_fdct4x4 neon sse2/;
  
    add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vpx_fdct4x4_1 sse2/;
@@ -532,7 +532,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
  } else {
    add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4 sse2 msa/;
+  specialize qw/vpx_fdct4x4 neon sse2 msa/;
  
    add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vpx_fdct4x4_1 sse2/;
@@ -629,18 +629,18 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  
    add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
-  specialize qw/vpx_highbd_idct4x4_1_add neon/;
+  specialize qw/vpx_highbd_idct4x4_1_add neon sse2/;
  
    add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
-  specialize qw/vpx_highbd_idct8x8_1_add neon/;
+  specialize qw/vpx_highbd_idct8x8_1_add neon sse2/;
  
    add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
-  specialize qw/vpx_highbd_idct16x16_1_add neon/;
+  specialize qw/vpx_highbd_idct16x16_1_add neon sse2/;
  
    add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
    add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
@@ -1142,12 +1142,10 @@ add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_
    specialize qw/vpx_variance8x4 sse2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-# TODO(johannkoenig): neon
-  specialize qw/vpx_variance4x8 sse2 msa/;
+  specialize qw/vpx_variance4x8 sse2 neon msa/;
  
  add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-# TODO(johannkoenig): neon
-  specialize qw/vpx_variance4x4 sse2 msa/;
+  specialize qw/vpx_variance4x4 sse2 neon msa/;
  
  #
  # Specialty Variance
@@ -1177,7 +1175,7 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int
    specialize qw/vpx_get4x4sse_cs neon msa vsx/;
  
  add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-  specialize qw/vpx_comp_avg_pred sse2 vsx/;
+  specialize qw/vpx_comp_avg_pred neon sse2 vsx/;
  
  #
  # Subpixel Variance
@@ -1216,49 +1214,49 @@ add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int s
    specialize qw/vpx_sub_pixel_variance8x4 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x8 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance4x8 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x4 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance4x4 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa sse2 ssse3/;
  
  add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa sse2 ssse3/;
  
  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c

index f16e4d0718669dcffc0704bf7f6e25d4be6d7a6b..a2412d124f1df31e6d61ea96b64b7f2c9db54133 100644 (file)
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -242,3 +242,8 @@ void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
      }
    }
  }
+
+void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int bd) {
+  highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
+}
diff --git a/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c

index bc9debf319c24b8133492428ef8bf4845cf8162b..06f26591808079c27d6d8d22ac04d9cbe949dd3f 100644 (file)
--- a/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
@@ -9,33 +9,12 @@
   */
  
  #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
  #include "vpx_dsp/x86/inv_txfm_sse2.h"
  #include "vpx_dsp/x86/transpose_sse2.h"
  #include "vpx_dsp/x86/txfm_common_sse2.h"
  
  void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
-  __m128i dc_value, d;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  int a, i, j;
-  tran_low_t out;
-
-  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
-  a = ROUND_POWER_OF_TWO(out, 6);
-
-  d = _mm_set1_epi32(a);
-  dc_value = _mm_packs_epi32(d, d);
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 4; ++j) {
-      d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
-      d = _mm_adds_epi16(d, dc_value);
-      d = _mm_max_epi16(d, zero);
-      d = _mm_min_epi16(d, max);
-      _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
-    }
-    dest += stride;
-  }
+  highbd_idct_1_add_kernel(input, dest, stride, bd, 32);
  }
diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c

index f3a69795ec112ea63909e690a559217741b42828..5293f5694f65a6c1f114e68cf1177b2fa5b3056b 100644 (file)
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -14,116 +14,249 @@
  #include "vpx_dsp/x86/transpose_sse2.h"
  #include "vpx_dsp/x86/txfm_common_sse2.h"
  
+static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0,
+                                                   const __m128i in1) {  // gathers the low dword of four 64-bit products, then round-shifts
+  const __m128i t0 = _mm_unpacklo_epi32(in0, in1);  // low dwords of products 0, 1 (in the low 64 bits)
+  const __m128i t1 = _mm_unpackhi_epi32(in0, in1);  // low dwords of products 2, 3 (in the low 64 bits)
+  const __m128i t2 = _mm_unpacklo_epi64(t0, t1);    // 0, 1, 2, 3
+  return dct_const_round_shift_sse2(t2);  // helper defined outside this hunk
+}
+
+static INLINE __m128i wraplow_16bit_sse2(const __m128i in0, const __m128i in1,
+                                         const __m128i rounding) {  // (x + rounding) >> 4 per 32-bit lane, result packed to 16-bit
+  __m128i temp[2];
+  temp[0] = _mm_add_epi32(in0, rounding);
+  temp[1] = _mm_add_epi32(in1, rounding);
+  temp[0] = _mm_srai_epi32(temp[0], 4);  // arithmetic shift, so the sign is preserved
+  temp[1] = _mm_srai_epi32(temp[1], 4);
+  return _mm_packs_epi32(temp[0], temp[1]);  // signed saturation; in0 lands in the low four words
+}
+
+static INLINE void highbd_idct4_small_sse2(__m128i *const io) {  // one idct4 pass over io[0..3] in place, keeping only 32-bit products
+  const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0);  // constant sits in dwords 0 and 2, the ones _mm_mul_epu32 reads
+  const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0);
+  const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0);
+  __m128i temp1[4], temp2[4], step[4];
+
+  transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);  // helper defined elsewhere; comments below call io[n] "input[n]" after this step
+
+  // Note: There is no 32-bit signed multiply SIMD instruction in SSE2.
+  //       _mm_mul_epu32() is used which can only guarantee the lower 32-bit
+  //       (signed) result is meaningful, which is enough in this function.
+
+  // stage 1
+  temp1[0] = _mm_add_epi32(io[0], io[2]);             // input[0] + input[2]
+  temp2[0] = _mm_sub_epi32(io[0], io[2]);             // input[0] - input[2]
+  temp1[1] = _mm_srli_si128(temp1[0], 4);             // lanes 1, 3 moved down to dwords 0, 2
+  temp2[1] = _mm_srli_si128(temp2[0], 4);             // lanes 1, 3 moved down to dwords 0, 2
+  temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16);  // ([0] + [2])*cospi_16_64
+  temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16);  // ([0] + [2])*cospi_16_64
+  temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16);  // ([0] - [2])*cospi_16_64
+  temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16);  // ([0] - [2])*cospi_16_64
+  step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+  step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+  temp1[3] = _mm_srli_si128(io[1], 4);  // input[1], lanes 1, 3 (scratch; overwritten further down)
+  temp2[3] = _mm_srli_si128(io[3], 4);  // input[3], lanes 1, 3 (scratch; overwritten further down)
+  temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24);     // input[1] * cospi_24_64
+  temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24);  // input[1] * cospi_24_64
+  temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08);     // input[1] * cospi_8_64
+  temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08);  // input[1] * cospi_8_64
+  temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08);     // input[3] * cospi_8_64
+  temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08);  // input[3] * cospi_8_64
+  temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24);     // input[3] * cospi_24_64
+  temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24);  // input[3] * cospi_24_64
+  temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]);  // [1]*cospi_24 - [3]*cospi_8
+  temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]);  // [1]*cospi_24 - [3]*cospi_8
+  temp2[0] = _mm_add_epi64(temp2[0], temp2[2]);  // [1]*cospi_8 + [3]*cospi_24
+  temp2[1] = _mm_add_epi64(temp2[1], temp2[3]);  // [1]*cospi_8 + [3]*cospi_24
+  step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+  step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+  // stage 2
+  io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
+  io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
+  io[2] = _mm_sub_epi32(step[1], step[2]);  // step[1] - step[2]
+  io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
+}
+
+static INLINE void abs_extend_64bit_sse2(const __m128i in,
+                                         __m128i *const out /*out[2]*/,
+                                         __m128i *const sign /*sign[2]*/) {  // splits four 32-bit values into magnitudes and 64-bit sign masks
+  sign[0] = _mm_srai_epi32(in, 31);  // all-ones in negative lanes, zero elsewhere
+  out[0] = _mm_xor_si128(in, sign[0]);
+  out[0] = _mm_sub_epi32(out[0], sign[0]);  // (in ^ sign) - sign: two's-complement absolute value
+  sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]);  // 64-bit sign of 2, 3
+  sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]);  // 64-bit sign of 0, 1
+  out[1] = _mm_unpackhi_epi32(out[0], out[0]);     // 2, 3 (each magnitude duplicated into both dwords of its lane)
+  out[0] = _mm_unpacklo_epi32(out[0], out[0]);     // 0, 1 (each magnitude duplicated into both dwords of its lane)
+}
+
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
+                                               const __m128i sign,
+                                               const __m128i cospi) {  // signed 64-bit product built from a magnitude and a sign mask
+  __m128i out = _mm_mul_epu32(in, cospi);  // unsigned 32x32 -> 64 multiply of dwords 0 and 2
+  out = _mm_xor_si128(out, sign);
+  return _mm_sub_epi64(out, sign);  // (out ^ sign) - sign negates the lanes whose mask is all-ones
+}
+
+static INLINE __m128i dct_const_round_shift_64bit_sse2(const __m128i in) {  // adds a rounding term to both 64-bit lanes, then drops 16 bits
+  const __m128i t = _mm_add_epi64(
+      in,
+      _mm_setr_epi32(DCT_CONST_ROUNDING << 2, 0, DCT_CONST_ROUNDING << 2, 0));  // << 2 presumably pairs with the cospi << 2 pre-scale in highbd_idct4_large_sse2
+  return _mm_srli_si128(t, 2);  // whole-register shift by 2 bytes, not per lane: only the low dword of each lane is meaningful afterwards
+}
+
+static INLINE __m128i pack_4_sse2(const __m128i in0, const __m128i in1) {  // collects the low dword of each 64-bit lane: in0 -> 0, 1 and in1 -> 2, 3
+  const __m128i t0 = _mm_unpacklo_epi32(in0, in1);  // 0, 2
+  const __m128i t1 = _mm_unpackhi_epi32(in0, in1);  // 1, 3
+  return _mm_unpacklo_epi32(t0, t1);                // 0, 1, 2, 3
+}
+
+static INLINE void highbd_idct4_large_sse2(__m128i *const io) {  // one idct4 pass over io[0..3] in place, using full 64-bit signed products
+  const __m128i cospi_p16_p16 =
+      _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);  // constants pre-scaled by 4; dct_const_round_shift_64bit_sse2 drops 16 bits
+  const __m128i cospi_p08_p08 =
+      _mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0);
+  const __m128i cospi_p24_p24 =
+      _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
+  __m128i temp1[4], temp2[4], step[4], sign1[4], sign2[4];  // only sign1[0..1] and sign2[0..1] are written below
+
+  transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);  // helper defined elsewhere
+
+  // stage 1
+  temp1[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
+  temp2[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
+  abs_extend_64bit_sse2(temp1[0], temp1, sign1);  // temp1[0] is passed by value, so overwriting temp1[0..1] is safe
+  abs_extend_64bit_sse2(temp2[0], temp2, sign2);
+  temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p16_p16);  // ([0] + [2])*cospi_16, lanes 0, 1
+  temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p16_p16);  // ([0] + [2])*cospi_16, lanes 2, 3
+  temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p16_p16);  // ([0] - [2])*cospi_16, lanes 0, 1
+  temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p16_p16);  // ([0] - [2])*cospi_16, lanes 2, 3
+  temp1[0] = dct_const_round_shift_64bit_sse2(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit_sse2(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit_sse2(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit_sse2(temp2[1]);
+  step[0] = pack_4_sse2(temp1[0], temp1[1]);
+  step[1] = pack_4_sse2(temp2[0], temp2[1]);
+
+  abs_extend_64bit_sse2(io[1], temp1, sign1);  // temp1[0..1]: |input[1]|
+  abs_extend_64bit_sse2(io[3], temp2, sign2);  // temp2[0..1]: |input[3]|
+  temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p08_p08);  // [1]*cospi_8; taken before temp1[0] is overwritten
+  temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p08_p08);  // [1]*cospi_8
+  temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p24_p24);  // [1]*cospi_24
+  temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p24_p24);  // [1]*cospi_24
+  temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p24_p24);  // [3]*cospi_24; taken before temp2[0] is overwritten
+  temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p24_p24);  // [3]*cospi_24
+  temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p08_p08);  // [3]*cospi_8
+  temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p08_p08);  // [3]*cospi_8
+  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);  // [1]*cospi_24 - [3]*cospi_8
+  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);  // [1]*cospi_24 - [3]*cospi_8
+  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);  // [1]*cospi_8 + [3]*cospi_24
+  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);  // [1]*cospi_8 + [3]*cospi_24
+  temp1[0] = dct_const_round_shift_64bit_sse2(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit_sse2(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit_sse2(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit_sse2(temp2[1]);
+  step[2] = pack_4_sse2(temp1[0], temp1[1]);
+  step[3] = pack_4_sse2(temp2[0], temp2[1]);
+
+  // stage 2
+  io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
+  io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
+  io[2] = _mm_sub_epi32(step[1], step[2]);  // step[1] - step[2]
+  io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
+}
+
  void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  __m128i inptr[4];
-  __m128i sign_bits[2];
-  __m128i temp_mm, min_input, max_input;
-  int test;
-  int optimised_cols = 0;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i max = _mm_set1_epi16(12043);
-  const __m128i min = _mm_set1_epi16(-12043);
-  // Load input into __m128i
-  inptr[0] = _mm_loadu_si128((const __m128i *)input);
-  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
-  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
-  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
-  // Pack to 16 bits
-  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
-  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp_mm = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp_mm);
-
-  if (!test) {
-    // Do the row transform
-    idct4_sse2(inptr);
-
-    // Check the min & max values
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp_mm = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp_mm);
-
-    if (test) {
-      transpose_4x4(inptr);
-      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
-      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
-      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
-      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
-      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
-      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
-      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
-      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
-      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
-      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vpx_highbd_idct4_c(input, outptr, bd);
-      input += 4;
-      outptr += 4;
-    }
+  int16_t max = 0, min = 0;  // only computed when bd != 8; the bd == 8 path never reads them
+  __m128i io[4], io_short[2];  // io: four rows as 32-bit lanes; io_short: the same rows saturated to 16-bit
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0));  // aligned loads: input must be 16-byte aligned
+  io[1] = _mm_load_si128((const __m128i *)(input + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 8));
+  io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+  io_short[0] = _mm_packs_epi32(io[0], io[1]);  // rows 0-1, signed saturation
+  io_short[1] = _mm_packs_epi32(io[2], io[3]);  // rows 2-3, signed saturation
+
+  if (bd != 8) {  // find the extreme coefficients; they select the code path below
+    __m128i max_input, min_input;
+
+    max_input = _mm_max_epi16(io_short[0], io_short[1]);
+    min_input = _mm_min_epi16(io_short[0], io_short[1]);
+    max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8));  // fold words 4..7 onto 0..3
+    min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8));
+    max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4));  // fold words 2..3 onto 0..1
+    min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 4));
+    max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2));  // fold word 1 onto word 0
+    min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2));
+    max = _mm_extract_epi16(max_input, 0);  // word 0 now holds the maximum of all 16 packed coefficients
+    min = _mm_extract_epi16(min_input, 0);  // word 0 now holds the minimum of all 16 packed coefficients
    }
  
-  if (optimised_cols) {
-    idct4_sse2(inptr);
-
-    // Final round and shift
-    inptr[0] = _mm_add_epi16(inptr[0], eight);
-    inptr[1] = _mm_add_epi16(inptr[1], eight);
-
-    inptr[0] = _mm_srai_epi16(inptr[0], 4);
-    inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
-    // Reconstruction and Store
-    {
-      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
-      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
-      d0 = _mm_unpacklo_epi64(
-          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
-      d2 = _mm_unpacklo_epi64(
-          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
-      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
-      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
-      // store input0
-      _mm_storel_epi64((__m128i *)dest, d0);
-      // store input1
-      d0 = _mm_srli_si128(d0, 8);
-      _mm_storel_epi64((__m128i *)(dest + stride), d0);
-      // store input2
-      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
-      // store input3
-      d2 = _mm_srli_si128(d2, 8);
-      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
-    }
+  if (bd == 8 || (max < 4096 && min >= -4096)) {  // small range: 16-bit path; the 4096 bound presumably rules out 16-bit overflow — confirm
+    idct4_sse2(io_short);  // first of two passes; idct4_sse2 is defined outside this file's hunk
+    idct4_sse2(io_short);  // second pass
+    io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));  // rounding term for the >> 4 below
+    io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+    io[0] = _mm_srai_epi16(io_short[0], 4);  // io[0] now carries 16-bit residuals for rows 0-1
+    io[1] = _mm_srai_epi16(io_short[1], 4);  // io[1] now carries 16-bit residuals for rows 2-3
    } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[4], temp_out[4];
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-      vpx_highbd_idct4_c(temp_in, temp_out, bd);
-      for (j = 0; j < 4; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
-      }
+    if (max < 32767 && min > -32768) {  // strictly inside the int16 range, so the saturating pack clipped nothing
+      highbd_idct4_small_sse2(io);  // first of two passes on the 32-bit rows
+      highbd_idct4_small_sse2(io);  // second pass
+    } else {
+      highbd_idct4_large_sse2(io);  // first of two passes using 64-bit products
+      highbd_idct4_large_sse2(io);  // second pass
      }
+    io[0] = wraplow_16bit_sse2(io[0], io[1], _mm_set1_epi32(8));  // (x + 8) >> 4, rows 0-1 packed to 16-bit
+    io[1] = wraplow_16bit_sse2(io[2], io[3], _mm_set1_epi32(8));  // (x + 8) >> 4, rows 2-3 packed to 16-bit
+  }
+
+  // Reconstruction and Store
+  {
+    __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);  // row 0 (4 pixels) in the low half
+    __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));  // row 2 in the low half
+    d0 = _mm_unpacklo_epi64(d0,
+                            _mm_loadl_epi64((const __m128i *)(dest + stride)));  // row 1 in the high half
+    d2 = _mm_unpacklo_epi64(
+        d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));  // row 3 in the high half
+    d0 = clamp_high_sse2(_mm_adds_epi16(d0, io[0]), bd);  // saturating add of the residual, then clamp for bit depth bd
+    d2 = clamp_high_sse2(_mm_adds_epi16(d2, io[1]), bd);
+    // store row 0
+    _mm_storel_epi64((__m128i *)dest, d0);
+    // store row 1
+    d0 = _mm_srli_si128(d0, 8);
+    _mm_storel_epi64((__m128i *)(dest + stride), d0);
+    // store row 2
+    _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+    // store row 3
+    d2 = _mm_srli_si128(d2, 8);
+    _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+  }
+}
+
+void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int bd) {  // DC-only 4x4 case: only input[0] is read
+  const __m128i zero = _mm_setzero_si128();
+  // Faster than _mm_set1_epi16((1 << bd) - 1).
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);  // (1 << bd) - 1 in every 16-bit word
+  int a1, i;
+  tran_low_t out;
+  __m128i dc, d;
+
+  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);  // DC scaled by cospi_16_64 ...
+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);  // ... twice, presumably once per 1-D pass
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+  dc = _mm_set1_epi16(a1);  // a1 narrowed to 16 bits and broadcast to all words
+
+  for (i = 0; i < 4; ++i) {  // one row of 4 pixels (8 bytes) per iteration
+    d = _mm_loadl_epi64((const __m128i *)dest);
+    d = add_dc_clamp(&zero, &max, &dc, &d);  // add dc, clamp to [0, max]
+    _mm_storel_epi64((__m128i *)dest, d);
+    dest += stride;
    }
  }
diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c

index 6a2e180646c77a0ce2eb7faa3113c32645f58634..29cc1d30ec360bec220ee3af140d7a67428e3825 100644 (file)
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -214,3 +214,8 @@ void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
      }
    }
  }
+
+void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int bd) {
+  highbd_idct_1_add_kernel(input, dest, stride, bd, 8);  // DC-only add for an 8x8 block via the shared kernel
+}
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h

index 774cce1d40c39aaf139c92313debc6dea4f3bc24..ea100c6e1975306e5d9d418b9781ae408f19a25f 100644 (file)
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -17,6 +17,43 @@
  #include "vpx_dsp/inv_txfm.h"
  #include "vpx_dsp/x86/txfm_common_sse2.h"
  
+static INLINE __m128i add_dc_clamp(const __m128i *const min,
+                                   const __m128i *const max,
+                                   const __m128i *const dc,
+                                   const __m128i *const in) {  // returns clamp(*in + *dc, *min, *max) per 16-bit word
+  __m128i out;
+  out = _mm_adds_epi16(*in, *dc);  // signed saturating add
+  out = _mm_max_epi16(out, *min);  // lower bound (signed compare)
+  out = _mm_min_epi16(out, *max);  // upper bound (signed compare)
+  return out;
+}
+
+static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
+                                            uint16_t *dest, int stride, int bd,
+                                            const int size) {  // adds the rounded DC term to a size x size block; only input[0] is read
+  const __m128i zero = _mm_setzero_si128();
+  // Faster than _mm_set1_epi16((1 << bd) - 1).
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);  // (1 << bd) - 1 in every 16-bit word
+  int a1, i, j;
+  tran_low_t out;
+  __m128i dc, d;
+
+  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+  a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6);  // shift 5 for size 8, 6 for every other size
+  dc = _mm_set1_epi16(a1);  // a1 narrowed to 16 bits and broadcast to all words
+
+  for (i = 0; i < size; ++i) {
+    for (j = 0; j < (size >> 3); ++j) {  // size / 8 vectors of 8 pixels per row
+      d = _mm_load_si128((const __m128i *)(&dest[j * 8]));  // NOTE(review): aligned load; assumes 16-byte-aligned dest rows — confirm for all callers
+      d = add_dc_clamp(&zero, &max, &dc, &d);  // add dc, clamp to [0, max]
+      _mm_store_si128((__m128i *)(&dest[j * 8]), d);  // aligned store, same alignment assumption
+    }
+    dest += stride;
+  }
+}
+
  static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
    __m128i ubounded, retval;
    const __m128i zero = _mm_set1_epi16(0);
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c

index 4b201b987595992ae3d3ccf192c543c7c1a74799..d221084970e566c8d7301d5eda26e1961fd980d5 100644 (file)
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -74,30 +74,14 @@ void idct4_sse2(__m128i *in) {
    const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
    const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8];
+  __m128i u[2];
  
-  transpose_4x4(in);
+  transpose_16bit_4x4(in);
    // stage 1
    u[0] = _mm_unpacklo_epi16(in[0], in[1]);
    u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
-  u[0] = _mm_packs_epi32(v[0], v[1]);
-  u[1] = _mm_packs_epi32(v[3], v[2]);
+  u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]);
+  u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]);
  
    // stage 2
    in[0] = _mm_add_epi16(u[0], u[1]);
@@ -115,7 +99,7 @@ void iadst4_sse2(__m128i *in) {
    const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
    __m128i u[8], v[8], in7;
  
-  transpose_4x4(in);
+  transpose_16bit_4x4(in);
    in7 = _mm_srli_si128(in[1], 8);
    in7 = _mm_add_epi16(in7, in[0]);
    in7 = _mm_sub_epi16(in7, in[1]);
@@ -156,23 +140,8 @@ void iadst4_sse2(__m128i *in) {
  
  #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
    {                                                                  \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                               \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                               \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                               \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                               \
-                                                                     \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                            \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                            \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                            \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                            \
-                                                                     \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                     \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                     \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                     \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                     \
-                                                                     \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                              \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                              \
+    res0 = idct_calc_wraplow_sse2(lo_0, hi_0, cst0);                 \
+    res1 = idct_calc_wraplow_sse2(lo_0, hi_0, cst1);                 \
    }
  
  #define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
@@ -215,23 +184,8 @@ void iadst4_sse2(__m128i *in) {
        stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                 \
        stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                 \
                                                                                \
-      tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \
-      tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \
-      tmp2 = _mm_madd_epi16(lo_56, stg2_0);                                   \
-      tmp3 = _mm_madd_epi16(hi_56, stg2_0);                                   \
-                                                                              \
-      tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
-      tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
-      tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
-      tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
-                                                                              \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
-                                                                              \
-      stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
-      stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
+      stp1_5 = idct_calc_wraplow_sse2(lo_56, hi_56, stg2_1);                  \
+      stp1_6 = idct_calc_wraplow_sse2(lo_56, hi_56, stg2_0);                  \
      }                                                                         \
                                                                                \
      /* Stage4  */                                                             \
@@ -248,7 +202,6 @@ void iadst4_sse2(__m128i *in) {
  void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
    const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    const __m128i final_rounding = _mm_set1_epi16(1 << 4);
    const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -262,7 +215,6 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
    __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int i;
  
    // Load input data.
@@ -338,7 +290,6 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
  }
  
  void idct8_sse2(__m128i *in) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
    const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
@@ -351,7 +302,6 @@ void idct8_sse2(__m128i *in) {
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
    __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  
    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
    TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
@@ -548,37 +498,10 @@ void iadst8_sse2(__m128i *in) {
    u2 = _mm_unpacklo_epi16(s6, s7);
    u3 = _mm_unpackhi_epi16(s6, s7);
  
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
+  s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16);
+  s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16);
+  s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16);
+  s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16);
  
    in[0] = s0;
    in[1] = _mm_sub_epi16(k__const_0, s4);
@@ -593,7 +516,6 @@ void iadst8_sse2(__m128i *in) {
  void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
    const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    const __m128i final_rounding = _mm_set1_epi16(1 << 4);
    const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -608,7 +530,7 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
    __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
    __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp0, tmp1, tmp2, tmp3;
  
    // Rows. Load 4-row input data.
    in0 = load_input_data(input);
@@ -623,22 +545,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
      const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
      const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
  
-    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
+    stp1_4 = idct_calc_wraplow_sse2(stg1_0, stg1_1, lo_17);
+    stp1_5 = idct_calc_wraplow_sse2(stg1_2, stg1_3, lo_35);
    }
  
    // Stage2
@@ -646,22 +554,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
      const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
  
-    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
-    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
-    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
+    stp2_0 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_04);
+    stp2_2 = idct_calc_wraplow_sse2(stg2_3, stg2_2, lo_26);
  
      tmp0 = _mm_add_epi16(stp1_4, stp1_5);
      tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
@@ -675,21 +569,11 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
    {
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
  
-    tmp4 = _mm_add_epi16(stp2_0, stp2_2);
-    tmp6 = _mm_sub_epi16(stp2_0, stp2_2);
-
-    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
-    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
-    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+    tmp0 = _mm_add_epi16(stp2_0, stp2_2);
+    tmp1 = _mm_sub_epi16(stp2_0, stp2_2);
+    stp1_2 = _mm_unpackhi_epi64(tmp1, tmp0);
+    stp1_3 = _mm_unpacklo_epi64(tmp1, tmp0);
+    stp1_5 = idct_calc_wraplow_sse2(stg3_0, stg2_0, lo_56);  // stg3_1 = stg2_0
    }
  
    // Stage4
@@ -806,23 +690,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
                                                                                 \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
+    stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1);                   \
+    stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0);                   \
                                                                                 \
      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
@@ -910,23 +779,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
      stp1_2 = stp1_1;                                                           \
      stp1_3 = stp1_0;                                                           \
                                                                                 \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
+    stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1);                   \
+    stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0);                   \
                                                                                 \
      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
@@ -962,7 +816,6 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
  
  void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                  int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    const __m128i final_rounding = _mm_set1_epi16(1 << 5);
    const __m128i zero = _mm_setzero_si128();
  
@@ -997,7 +850,6 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
        stp1_8_0, stp1_12_0;
    __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
        stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int i;
  
    curr1 = l;
@@ -1505,69 +1357,19 @@ static void iadst16_8col(__m128i *in) {
    u[6] = _mm_unpacklo_epi16(s[14], s[15]);
    u[7] = _mm_unpackhi_epi16(s[14], s[15]);
  
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+  in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16);
+  in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+  in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
+  in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
+  in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16);
+  in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16);
+  in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16);
+  in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16);
  
    in[0] = s[0];
    in[1] = _mm_sub_epi16(kZero, s[8]);
    in[2] = s[12];
    in[3] = _mm_sub_epi16(kZero, s[4]);
-  in[4] = _mm_packs_epi32(v[4], v[5]);
-  in[5] = _mm_packs_epi32(v[12], v[13]);
-  in[6] = _mm_packs_epi32(v[8], v[9]);
-  in[7] = _mm_packs_epi32(v[0], v[1]);
-  in[8] = _mm_packs_epi32(v[2], v[3]);
-  in[9] = _mm_packs_epi32(v[10], v[11]);
-  in[10] = _mm_packs_epi32(v[14], v[15]);
-  in[11] = _mm_packs_epi32(v[6], v[7]);
    in[12] = s[5];
    in[13] = _mm_sub_epi16(kZero, s[13]);
    in[14] = s[9];
@@ -1595,8 +1397,7 @@ static void idct16_8col(__m128i *in) {
    const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i v[16], u[16], s[16], t[16];
+  __m128i u[16], s[16], t[16];
  
    // stage 1
    s[0] = in[0];
@@ -1626,65 +1427,14 @@ static void idct16_8col(__m128i *in) {
    u[6] = _mm_unpacklo_epi16(s[11], s[12]);
    u[7] = _mm_unpackhi_epi16(s[11], s[12]);
  
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[8] = _mm_packs_epi32(u[0], u[1]);
-  s[15] = _mm_packs_epi32(u[2], u[3]);
-  s[9] = _mm_packs_epi32(u[4], u[5]);
-  s[14] = _mm_packs_epi32(u[6], u[7]);
-  s[10] = _mm_packs_epi32(u[8], u[9]);
-  s[13] = _mm_packs_epi32(u[10], u[11]);
-  s[11] = _mm_packs_epi32(u[12], u[13]);
-  s[12] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p30_m02);
+  s[15] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p02_p30);
+  s[9] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p14_m18);
+  s[14] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p18_p14);
+  s[10] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p22_m10);
+  s[13] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p10_p22);
+  s[11] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p06_m26);
+  s[12] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p26_p06);
  
    // stage 3
    t[0] = s[0];
@@ -1696,37 +1446,10 @@ static void idct16_8col(__m128i *in) {
    u[2] = _mm_unpacklo_epi16(s[5], s[6]);
    u[3] = _mm_unpackhi_epi16(s[5], s[6]);
  
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[4] = _mm_packs_epi32(u[0], u[1]);
-  t[7] = _mm_packs_epi32(u[2], u[3]);
-  t[5] = _mm_packs_epi32(u[4], u[5]);
-  t[6] = _mm_packs_epi32(u[6], u[7]);
+  t[4] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p28_m04);
+  t[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p04_p28);
+  t[5] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p12_m20);
+  t[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p20_p12);
    t[8] = _mm_add_epi16(s[8], s[9]);
    t[9] = _mm_sub_epi16(s[8], s[9]);
    t[10] = _mm_sub_epi16(s[11], s[10]);
@@ -1746,71 +1469,20 @@ static void idct16_8col(__m128i *in) {
    u[6] = _mm_unpacklo_epi16(t[10], t[13]);
    u[7] = _mm_unpackhi_epi16(t[10], t[13]);
  
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[0] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
+  s[1] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+  s[2] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p24_m08);
+  s[3] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p08_p24);
+  s[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m08_p24);
+  s[14] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p24_p08);
+  s[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m24_m08);
+  s[13] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m08_p24);
    s[4] = _mm_add_epi16(t[4], t[5]);
    s[5] = _mm_sub_epi16(t[4], t[5]);
    s[6] = _mm_sub_epi16(t[7], t[6]);
    s[7] = _mm_add_epi16(t[6], t[7]);
    s[8] = t[8];
    s[15] = t[15];
-  s[9] = _mm_packs_epi32(u[8], u[9]);
-  s[14] = _mm_packs_epi32(u[10], u[11]);
-  s[10] = _mm_packs_epi32(u[12], u[13]);
-  s[13] = _mm_packs_epi32(u[14], u[15]);
    s[11] = t[11];
    s[12] = t[12];
  
@@ -1824,20 +1496,8 @@ static void idct16_8col(__m128i *in) {
  
    u[0] = _mm_unpacklo_epi16(s[5], s[6]);
    u[1] = _mm_unpackhi_epi16(s[5], s[6]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  t[5] = _mm_packs_epi32(u[0], u[1]);
-  t[6] = _mm_packs_epi32(u[2], u[3]);
+  t[5] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16);
+  t[6] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
  
    t[8] = _mm_add_epi16(s[8], s[11]);
    t[9] = _mm_add_epi16(s[9], s[10]);
@@ -1865,37 +1525,10 @@ static void idct16_8col(__m128i *in) {
    u[2] = _mm_unpacklo_epi16(t[11], t[12]);
    u[3] = _mm_unpackhi_epi16(t[11], t[12]);
  
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  s[10] = _mm_packs_epi32(u[0], u[1]);
-  s[13] = _mm_packs_epi32(u[2], u[3]);
-  s[11] = _mm_packs_epi32(u[4], u[5]);
-  s[12] = _mm_packs_epi32(u[6], u[7]);
+  s[10] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16);
+  s[13] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
+  s[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
+  s[12] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
    s[14] = t[14];
    s[15] = t[15];
  
@@ -1932,7 +1565,6 @@ void iadst16_sse2(__m128i *in0, __m128i *in1) {
  
  void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    const __m128i final_rounding = _mm_set1_epi16(1 << 5);
    const __m128i zero = _mm_setzero_si128();
  
@@ -1958,7 +1590,7 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
        stp1_12_0;
    __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
        stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp0, tmp1, tmp2, tmp3;
    int i;
    // First 1-D inverse DCT
    // Load input data.
@@ -1974,41 +1606,17 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
      const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
      const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
  
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
-    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
-    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
+    stp2_8 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_1_15);
+    stp2_11 = idct_calc_wraplow_sse2(stg2_6, stg2_7, lo_13_3);
    }
  
    // Stage3
    {
      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
  
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
+    stp1_4 = idct_calc_wraplow_sse2(stg3_0, stg3_1, lo_2_14);
      stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
      stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    }
  
    // Stage4
@@ -2017,31 +1625,12 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
  
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
-    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
+    tmp0 = idct_madd_round_shift_sse2(lo_0_8, stg4_0);
+    tmp1 = idct_madd_round_shift_sse2(lo_0_8, stg4_1);
      stp1_0 = _mm_packs_epi32(tmp0, tmp0);
-    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
-    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+    stp1_1 = _mm_packs_epi32(tmp1, tmp1);
+    stp2_9 = idct_calc_wraplow_sse2(stg4_4, stg4_5, lo_9_14);
+    stp2_10 = idct_calc_wraplow_sse2(stg4_6, stg4_7, lo_10_13);
  
      stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
    }
@@ -2070,33 +1659,16 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
  
-    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
-    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+    stp1_6 = idct_calc_wraplow_sse2(stg4_0, stg4_1, lo_6_5);
+    tmp0 = idct_madd_round_shift_sse2(lo_10_13, stg6_0);
+    tmp1 = idct_madd_round_shift_sse2(lo_10_13, stg4_0);
+    tmp2 = idct_madd_round_shift_sse2(lo_11_12, stg6_0);
+    tmp3 = idct_madd_round_shift_sse2(lo_11_12, stg4_0);
  
      stp2_10 = _mm_packs_epi32(tmp0, zero);
-    stp2_13 = _mm_packs_epi32(tmp2, zero);
-    stp2_11 = _mm_packs_epi32(tmp4, zero);
-    stp2_12 = _mm_packs_epi32(tmp6, zero);
+    stp2_13 = _mm_packs_epi32(tmp1, zero);
+    stp2_11 = _mm_packs_epi32(tmp2, zero);
+    stp2_12 = _mm_packs_epi32(tmp3, zero);
  
      tmp0 = _mm_add_epi16(stp1_0, stp1_4);
      tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
@@ -2330,23 +1902,8 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
      stp1_2 = stp2_1;                                                           \
      stp1_3 = stp2_0;                                                           \
                                                                                 \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
+    stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1);                   \
+    stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0);                   \
                                                                                 \
      stp1_4 = stp2_4;                                                           \
      stp1_7 = stp2_7;                                                           \
@@ -2660,23 +2217,8 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
                                                                                 \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
+    stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1);                   \
+    stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0);                   \
                                                                                 \
      stp1_4 = stp2_4;                                                           \
      stp1_7 = stp2_7;                                                           \
@@ -2803,7 +2345,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
  void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
    const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  
    // idct constants for each stage
@@ -2847,7 +2388,6 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
        stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
        stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
        stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int i;
  
    // Load input data. Only need to load the top left 8x8 block.
@@ -2949,7 +2489,6 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
  
  void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                   int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
    const __m128i final_rounding = _mm_set1_epi16(1 << 5);
    const __m128i zero = _mm_setzero_si128();
  
@@ -3010,7 +2549,6 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
        stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
        stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
        stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int i, j, i32;
  
    for (i = 0; i < 4; i++) {
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h

index 0460ab13bcbda57f36aa63ef9f9d8c248c02afaf..9eead0915cf94739ad3e06d1182c2bb5c09c6a44 100644 (file)
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -120,6 +120,26 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
    res0[15] = tbuf[7];
  }
  
+static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
+  const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
+  return _mm_srai_epi32(t, DCT_CONST_BITS);
+}
+
+static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in,
+                                                 const __m128i cospi) {
+  const __m128i t = _mm_madd_epi16(in, cospi);
+  return dct_const_round_shift_sse2(t);
+}
+
+// Multiply-add in0/in1 with x, round-shift, and pack to 16 bits (saturating).
+static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
+                                             const __m128i in1,
+                                             const __m128i x) {
+  const __m128i t0 = idct_madd_round_shift_sse2(in0, x);
+  const __m128i t1 = idct_madd_round_shift_sse2(in1, x);
+  return _mm_packs_epi32(t0, t1);
+}
+
  // Function to allow 8 bit optimisations to be used when profile 0 is used with
  // highbitdepth enabled
  static INLINE __m128i load_input_data(const tran_low_t *data) {
@@ -246,37 +266,10 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
  #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
                                 res0, res1, res2, res3)                         \
    {                                                                            \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                                         \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                                         \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                                         \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                                         \
-    tmp4 = _mm_madd_epi16(lo_1, cst2);                                         \
-    tmp5 = _mm_madd_epi16(hi_1, cst2);                                         \
-    tmp6 = _mm_madd_epi16(lo_1, cst3);                                         \
-    tmp7 = _mm_madd_epi16(hi_1, cst3);                                         \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-    tmp4 = _mm_add_epi32(tmp4, rounding);                                      \
-    tmp5 = _mm_add_epi32(tmp5, rounding);                                      \
-    tmp6 = _mm_add_epi32(tmp6, rounding);                                      \
-    tmp7 = _mm_add_epi32(tmp7, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);                               \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);                               \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);                               \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);                               \
-                                                                               \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                                        \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                                        \
-    res2 = _mm_packs_epi32(tmp4, tmp5);                                        \
-    res3 = _mm_packs_epi32(tmp6, tmp7);                                        \
+    res0 = idct_calc_wraplow_sse2(lo_0, hi_0, cst0);                           \
+    res1 = idct_calc_wraplow_sse2(lo_0, hi_0, cst1);                           \
+    res2 = idct_calc_wraplow_sse2(lo_1, hi_1, cst2);                           \
+    res3 = idct_calc_wraplow_sse2(lo_1, hi_1, cst3);                           \
    }
  
  static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
diff --git a/vpx_dsp/x86/transpose_sse2.h b/vpx_dsp/x86/transpose_sse2.h

index 7292723e38c4d5de7122c8b6e3f360ca01ce39ae..a5e40245a09ef1adc030fbf06f29475ad2ddf916 100644 (file)
--- a/vpx_dsp/x86/transpose_sse2.h
+++ b/vpx_dsp/x86/transpose_sse2.h
@@ -15,7 +15,7 @@
  #include "vpx_dsp/x86/inv_txfm_sse2.h"
  #include "vpx_dsp/x86/txfm_common_sse2.h"
  
-static INLINE void transpose_4x4(__m128i *res) {
+static INLINE void transpose_16bit_4x4(__m128i *res) {
    const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
    const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
  
@@ -23,4 +23,33 @@ static INLINE void transpose_4x4(__m128i *res) {
    res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
  }
  
+static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,
+                                       __m128i *const a2, __m128i *const a3) {
+  // Unpack 32 bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // to:
+  // b0: 00 10 01 11
+  // b1: 20 30 21 31
+  // b2: 02 12 03 13
+  // b3: 22 32 23 33
+
+  const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1);
+  const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3);
+  const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1);
+  const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3);
+
+  // Unpack 64 bit elements resulting in:
+  // a0: 00 10 20 30
+  // a1: 01 11 21 31
+  // a2: 02 12 22 32
+  // a3: 03 13 23 33
+  *a0 = _mm_unpacklo_epi64(b0, b1);
+  *a1 = _mm_unpackhi_epi64(b0, b1);
+  *a2 = _mm_unpacklo_epi64(b2, b3);
+  *a3 = _mm_unpackhi_epi64(b2, b3);
+}
+
  #endif  // VPX_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h

index 35751cef8c6cfe902ddbe799684b1cd682ec32e8..bfef783b133219d1f2ab401b63f20dcb75100d7e 100644 (file)
--- a/vpx_ports/mem.h
+++ b/vpx_ports/mem.h
@@ -23,12 +23,6 @@
  #define DECLARE_ALIGNED(n, typ, val) typ val
  #endif
  
-#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
-#define DECLARE_PROTECTED(decl) decl __attribute__((visibility("protected")))
-#else
-#define DECLARE_PROTECTED(decl) decl
-#endif
-
  #if HAVE_NEON && defined(_MSC_VER)
  #define __builtin_prefetch(x)
  #endif
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c

index a6aaff95a04c80ee0dd791c450aa6129d62eb8e6..e231806505d528bc38160a74685642c56f0ee375 100644 (file)
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -111,25 +111,6 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
    assert(ybf->y_height - ybf->y_crop_height >= 0);
    assert(ybf->y_width - ybf->y_crop_width >= 0);
  
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
-                      ybf->y_crop_height, ybf->border, ybf->border,
-                      ybf->border + ybf->y_height - ybf->y_crop_height,
-                      ybf->border + ybf->y_width - ybf->y_crop_width);
-
-    extend_plane_high(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width,
-                      ybf->uv_crop_height, uv_border, uv_border,
-                      uv_border + ybf->uv_height - ybf->uv_crop_height,
-                      uv_border + ybf->uv_width - ybf->uv_crop_width);
-
-    extend_plane_high(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width,
-                      ybf->uv_crop_height, uv_border, uv_border,
-                      uv_border + ybf->uv_height - ybf->uv_crop_height,
-                      uv_border + ybf->uv_width - ybf->uv_crop_width);
-    return;
-  }
-#endif
    extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
                 ybf->y_crop_height, ybf->border, ybf->border,
                 ybf->border + ybf->y_height - ybf->y_crop_height,
@@ -208,12 +189,55 @@ static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
  // Copies the source image into the destination image and updates the
  // destination's UMV borders.
  // Note: The frames are assumed to be identical in size.
+
  void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
                             YV12_BUFFER_CONFIG *dst_ybc) {
    int row;
    const uint8_t *src = src_ybc->y_buffer;
    uint8_t *dst = dst_ybc->y_buffer;
  
+#if 0
+  /* These assertions are valid in the codec, but the libvpx-tester uses
+   * this code slightly differently.
+   */
+  assert(src_ybc->y_width == dst_ybc->y_width);
+  assert(src_ybc->y_height == dst_ybc->y_height);
+#endif
+
+  for (row = 0; row < src_ybc->y_height; ++row) {
+    memcpy(dst, src, src_ybc->y_width);
+    src += src_ybc->y_stride;
+    dst += dst_ybc->y_stride;
+  }
+
+  src = src_ybc->u_buffer;
+  dst = dst_ybc->u_buffer;
+
+  for (row = 0; row < src_ybc->uv_height; ++row) {
+    memcpy(dst, src, src_ybc->uv_width);
+    src += src_ybc->uv_stride;
+    dst += dst_ybc->uv_stride;
+  }
+
+  src = src_ybc->v_buffer;
+  dst = dst_ybc->v_buffer;
+
+  for (row = 0; row < src_ybc->uv_height; ++row) {
+    memcpy(dst, src, src_ybc->uv_width);
+    src += src_ybc->uv_stride;
+    dst += dst_ybc->uv_stride;
+  }
+
+  vp8_yv12_extend_frame_borders_c(dst_ybc);
+}
+
+#if CONFIG_VP9
+void vpx_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
+                           YV12_BUFFER_CONFIG *dst_ybc) {
+  int row;
+  const uint8_t *src = src_ybc->y_buffer;
+  uint8_t *dst = dst_ybc->y_buffer;
+
  #if 0
    /* These assertions are valid in the codec, but the libvpx-tester uses
     * this code slightly differently.
@@ -249,7 +273,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
        dst += dst_ybc->uv_stride;
      }
  
-    vp8_yv12_extend_frame_borders_c(dst_ybc);
+    vpx_extend_frame_borders_c(dst_ybc);
      return;
    } else {
      assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH));
@@ -280,8 +304,9 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
      dst += dst_ybc->uv_stride;
    }
  
-  vp8_yv12_extend_frame_borders_c(dst_ybc);
+  vpx_extend_frame_borders_c(dst_ybc);
  }
+#endif  // CONFIG_VP9
  
  void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
                         YV12_BUFFER_CONFIG *dst_ybc) {
diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl

index 44b115c7eb78d06de322757482c309fd454ef668..75a3ad3881457d90a1e8d827d343d60d8b8d7347 100644 (file)
--- a/vpx_scale/vpx_scale_rtcd.pl
+++ b/vpx_scale/vpx_scale_rtcd.pl
@@ -23,6 +23,8 @@ add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_yb
  add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
  
  if (vpx_config("CONFIG_VP9") eq "yes") {
+    add_proto qw/void vpx_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+
      add_proto qw/void vpx_extend_frame_borders/, "struct yv12_buffer_config *ybf";
      specialize qw/vpx_extend_frame_borders dspr2/;
author	James Zern <jzern@google.com>
	Tue, 6 Jun 2017 23:52:39 +0000 (23:52 +0000)
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>
	Tue, 6 Jun 2017 23:52:39 +0000 (23:52 +0000)
README		patch \| blob \| history
build/make/gen_msvs_sln.sh		patch \| blob \| history
build/make/gen_msvs_vcxproj.sh		patch \| blob \| history
configure		patch \| blob \| history
test/buffer.h		patch \| blob \| history
test/comp_avg_pred_test.cc		patch \| blob \| history
test/datarate_test.cc		patch \| blob \| history
test/fdct4x4_test.cc		patch \| blob \| history
test/idct_test.cc		patch \| blob \| history
test/partial_idct_test.cc		patch \| blob \| history
test/pp_filter_test.cc		patch \| blob \| history
test/temporal_filter_test.cc		patch \| blob \| history
test/variance_test.cc		patch \| blob \| history
test/vpx_temporal_svc_encoder.sh		patch \| blob \| history
vp8/common/arm/neon/bilinearpredict_neon.c		patch \| blob \| history
vp8/common/arm/neon/sixtappredict_neon.c		patch \| blob \| history
vp8/common/skin_detection.c	[new file with mode: 0644]	patch \| blob
vp8/common/skin_detection.h	[new file with mode: 0644]	patch \| blob
vp8/common/x86/filter_x86.c		patch \| blob \| history
vp8/encoder/onyx_if.c		patch \| blob \| history
vp8/encoder/pickinter.c		patch \| blob \| history
vp8/vp8cx.mk		patch \| blob \| history
vp9/common/vp9_postproc.c		patch \| blob \| history
vp9/decoder/vp9_decoder.c		patch \| blob \| history
vp9/encoder/arm/neon/vp9_quantize_neon.c		patch \| blob \| history
vp9/encoder/vp9_aq_cyclicrefresh.c		patch \| blob \| history
vp9/encoder/vp9_aq_cyclicrefresh.h		patch \| blob \| history
vp9/encoder/vp9_encodeframe.c		patch \| blob \| history
vp9/encoder/vp9_encoder.c		patch \| blob \| history
vp9/encoder/vp9_firstpass.c		patch \| blob \| history
vp9/encoder/vp9_firstpass.h		patch \| blob \| history
vp9/encoder/vp9_mcomp.c		patch \| blob \| history
vp9/encoder/vp9_pickmode.c		patch \| blob \| history
vp9/encoder/vp9_ratectrl.c		patch \| blob \| history
vp9/encoder/vp9_speed_features.c		patch \| blob \| history
vp9/encoder/vp9_svc_layercontext.c		patch \| blob \| history
vp9/encoder/vp9_svc_layercontext.h		patch \| blob \| history
vpx_dsp/arm/avg_neon.c		patch \| blob \| history
vpx_dsp/arm/avg_pred_neon.c	[new file with mode: 0644]	patch \| blob
vpx_dsp/arm/fdct_neon.c	[new file with mode: 0644]	patch \| blob
vpx_dsp/arm/fwd_txfm_neon.c		patch \| blob \| history
vpx_dsp/arm/hadamard_neon.c		patch \| blob \| history
vpx_dsp/arm/highbd_vpx_convolve_neon.c		patch \| blob \| history
vpx_dsp/arm/idct16x16_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct32x32_135_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct32x32_34_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct32x32_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct4x4_1_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct4x4_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct8x8_add_neon.c		patch \| blob \| history
vpx_dsp/arm/idct_neon.h		patch \| blob \| history
vpx_dsp/arm/mem_neon.h	[new file with mode: 0644]	patch \| blob
vpx_dsp/arm/subpel_variance_neon.c		patch \| blob \| history
vpx_dsp/arm/variance_neon.c		patch \| blob \| history
vpx_dsp/arm/vpx_convolve_neon.c		patch \| blob \| history
vpx_dsp/deblock.c		patch \| blob \| history
vpx_dsp/variance.c		patch \| blob \| history
vpx_dsp/vpx_dsp.mk		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history
vpx_dsp/x86/highbd_idct16x16_add_sse2.c		patch \| blob \| history
vpx_dsp/x86/highbd_idct32x32_add_sse2.c		patch \| blob \| history
vpx_dsp/x86/highbd_idct4x4_add_sse2.c		patch \| blob \| history
vpx_dsp/x86/highbd_idct8x8_add_sse2.c		patch \| blob \| history
vpx_dsp/x86/highbd_inv_txfm_sse2.h		patch \| blob \| history
vpx_dsp/x86/inv_txfm_sse2.c		patch \| blob \| history
vpx_dsp/x86/inv_txfm_sse2.h		patch \| blob \| history
vpx_dsp/x86/transpose_sse2.h		patch \| blob \| history
vpx_ports/mem.h		patch \| blob \| history
vpx_scale/generic/yv12extend.c		patch \| blob \| history
vpx_scale/vpx_scale_rtcd.pl		patch \| blob \| history