quantize: simplify 32x32_b args

author Johann <johann@duck.com>
Sat, 5 Nov 2022 00:53:07 +0000 (09:53 +0900)
committer Johann <johann@duck.com>
Tue, 28 Feb 2023 09:46:16 +0000 (18:46 +0900)
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc

index 587cec69232e2022ac9ee1a8be9a452942d08e87..ecb6116f0ca31b189a24386768c49b09d2f9ac23 100644 (file)
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -26,6 +26,7 @@
  #include "test/util.h"
  #include "vp9/common/vp9_entropy.h"
  #include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
  #include "vpx/vpx_codec.h"
  #include "vpx/vpx_integer.h"
  #include "vpx_ports/msvc.h"
@@ -38,8 +39,7 @@ namespace {
  const int number_of_iterations = 100;
  
  typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
-                             const int16_t *zbin, const int16_t *round,
-                             const int16_t *quant, const int16_t *quant_shift,
+                             const macroblock_plane *const mb_plane,
                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
                               const int16_t *dequant, uint16_t *eob,
                               const int16_t *scan, const int16_t *iscan);
@@ -47,6 +47,41 @@ typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
                     int /*max_size*/, bool /*is_fp*/>
      QuantizeParam;
  
+// Wrapper which takes a macroblock_plane and unpacks it for a QuantizeBaseFunc.
+typedef void (*QuantizeBaseFunc)(const tran_low_t *coeff, intptr_t count,
+                                 const int16_t *zbin, const int16_t *round,
+                                 const int16_t *quant,
+                                 const int16_t *quant_shift, tran_low_t *qcoeff,
+                                 tran_low_t *dqcoeff, const int16_t *dequant,
+                                 uint16_t *eob, const int16_t *scan,
+                                 const int16_t *iscan);
+
+template <QuantizeBaseFunc fn>
+void QuantWrapper(const tran_low_t *coeff, intptr_t count,
+                  const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
+                  tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
+                  const int16_t *scan, const int16_t *iscan) {
+  fn(coeff, count, mb_plane->zbin, mb_plane->round, mb_plane->quant,
+     mb_plane->quant_shift, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+}
+
+// Wrapper for 32x32 version which does not use count.
+typedef void (*Quantize32x32Func)(const tran_low_t *coeff,
+                                  const macroblock_plane *const mb_plane,
+                                  tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                  const int16_t *dequant, uint16_t *eob,
+                                  const int16_t *scan, const int16_t *iscan);
+
+template <Quantize32x32Func fn>
+void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count,
+                       const macroblock_plane *const mb_plane,
+                       tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                       const int16_t *dequant, uint16_t *eob,
+                       const int16_t *scan, const int16_t *iscan) {
+  (void)count;
+  fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+}
+
  // Wrapper for FP version which does not use zbin or quant_shift.
  typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
                                 const int16_t *round, const int16_t *quant,
@@ -56,15 +91,11 @@ typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
  
  template <QuantizeFPFunc fn>
  void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
-                    const int16_t *zbin, const int16_t *round,
-                    const int16_t *quant, const int16_t *quant_shift,
-                    tran_low_t *qcoeff, tran_low_t *dqcoeff,
-                    const int16_t *dequant, uint16_t *eob, const int16_t *scan,
-                    const int16_t *iscan) {
-  (void)zbin;
-  (void)quant_shift;
-
-  fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan);
+                    const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
+                    tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
+                    const int16_t *scan, const int16_t *iscan) {
+  fn(coeff, count, mb_plane->round_fp, mb_plane->quant_fp, qcoeff, dqcoeff,
+     dequant, eob, scan, iscan);
  }
  
  void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
@@ -119,17 +150,16 @@ class VP9QuantizeBase : public AbstractBench {
  #else
      max_value_ = (1 << bit_depth_) - 1;
  #endif
-    zbin_ptr_ =
+    zbin_ptr_ = mb_plane_.zbin =
          reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
-    round_fp_ptr_ = reinterpret_cast<int16_t *>(
-        vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
-    quant_fp_ptr_ = reinterpret_cast<int16_t *>(
+    round_fp_ptr_ = mb_plane_.round_fp;
+    quant_fp_ptr_ = mb_plane_.quant_fp = reinterpret_cast<int16_t *>(
          vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
-    round_ptr_ =
+    round_ptr_ = mb_plane_.round =
          reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
-    quant_ptr_ =
+    quant_ptr_ = mb_plane_.quant =
          reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_)));
-    quant_shift_ptr_ = reinterpret_cast<int16_t *>(
+    quant_shift_ptr_ = mb_plane_.quant_shift = reinterpret_cast<int16_t *>(
          vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
      dequant_ptr_ = reinterpret_cast<int16_t *>(
          vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
@@ -140,7 +170,6 @@ class VP9QuantizeBase : public AbstractBench {
  
    ~VP9QuantizeBase() {
      vpx_free(zbin_ptr_);
-    vpx_free(round_fp_ptr_);
      vpx_free(quant_fp_ptr_);
      vpx_free(round_ptr_);
      vpx_free(quant_ptr_);
@@ -157,6 +186,7 @@ class VP9QuantizeBase : public AbstractBench {
    }
  
   protected:
+  macroblock_plane mb_plane_;
    int16_t *zbin_ptr_;
    int16_t *round_fp_ptr_;
    int16_t *quant_fp_ptr_;
@@ -193,10 +223,9 @@ class VP9QuantizeTest : public VP9QuantizeBase,
  };
  
  void VP9QuantizeTest::Run() {
-  quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-               quant_shift_ptr_, qcoeff_.TopLeftPixel(),
-               dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan,
-               scan_->iscan);
+  quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_,
+               qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), dequant_ptr_,
+               &eob_, scan_->scan, scan_->iscan);
  }
  
  void VP9QuantizeTest::Speed(bool is_median) {
@@ -266,8 +295,8 @@ void VP9QuantizeTest::Speed(bool is_median) {
  
          vpx_usec_timer_start(&timer);
          for (int n = 0; n < kNumTests; ++n) {
-          ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_,
-                           q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+          ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_,
+                           ref_qcoeff.TopLeftPixel(),
                             ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
                             scan_->scan, scan_->iscan);
          }
@@ -275,10 +304,9 @@ void VP9QuantizeTest::Speed(bool is_median) {
  
          vpx_usec_timer_start(&simd_timer);
          for (int n = 0; n < kNumTests; ++n) {
-          quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-                       quant_shift_ptr_, qcoeff_.TopLeftPixel(),
-                       dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_,
-                       scan_->scan, scan_->iscan);
+          quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_,
+                       qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
+                       dequant_ptr_, &eob_, scan_->scan, scan_->iscan);
          }
          vpx_usec_timer_mark(&simd_timer);
  
@@ -417,15 +445,14 @@ TEST_P(VP9QuantizeTest, OperationCheck) {
      GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                           quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                           quant_fp_ptr_);
-    ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-                     quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
-                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
-                     scan_->scan, scan_->iscan);
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_,
+                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
+                     dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
  
-    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-        quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
-        dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_,
+                                          &mb_plane_, qcoeff_.TopLeftPixel(),
+                                          dqcoeff_.TopLeftPixel(), dequant_ptr_,
+                                          &eob_, scan_->scan, scan_->iscan));
  
      EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
      EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
@@ -475,15 +502,14 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
      GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                           quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                           quant_fp_ptr_);
-    ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-                     quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
-                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
-                     scan_->scan, scan_->iscan);
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, &mb_plane_,
+                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
+                     dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
  
-    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
-        quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
-        dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_.TopLeftPixel(), count_,
+                                          &mb_plane_, qcoeff_.TopLeftPixel(),
+                                          dqcoeff_.TopLeftPixel(), dequant_ptr_,
+                                          &eob_, scan_->scan, scan_->iscan));
  
      EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
      EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
@@ -510,28 +536,35 @@ using std::make_tuple;
  INSTANTIATE_TEST_SUITE_P(
      SSE2, VP9QuantizeTest,
      ::testing::Values(
-        make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16,
-                   false),
+        make_tuple(&QuantWrapper<vpx_quantize_b_sse2>,
+                   &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
          make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
                     &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_8, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_10, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_12, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+                   32, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+                   32, false)));
  
  #else
  INSTANTIATE_TEST_SUITE_P(
      SSE2, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_sse2>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
                        make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
                                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
                                   16, true)));
@@ -541,11 +574,12 @@ INSTANTIATE_TEST_SUITE_P(
  #if HAVE_SSSE3
  INSTANTIATE_TEST_SUITE_P(
      SSSE3, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_ssse3,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false),
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_ssse3>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_ssse3>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false),
                        make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
                                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
                                   16, true),
@@ -555,13 +589,14 @@ INSTANTIATE_TEST_SUITE_P(
  #endif  // HAVE_SSSE3
  
  #if HAVE_AVX
-INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest,
-                         ::testing::Values(make_tuple(&vpx_quantize_b_avx,
-                                                      &vpx_quantize_b_c,
-                                                      VPX_BITS_8, 16, false),
-                                           make_tuple(&vpx_quantize_b_32x32_avx,
-                                                      &vpx_quantize_b_32x32_c,
-                                                      VPX_BITS_8, 32, false)));
+INSTANTIATE_TEST_SUITE_P(
+    AVX, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_avx>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
  #endif  // HAVE_AVX
  
  #if VPX_ARCH_X86_64 && HAVE_AVX2
@@ -577,22 +612,29 @@ INSTANTIATE_TEST_SUITE_P(
          make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>,
                     &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
                     32, true),
-        make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16,
+        make_tuple(&QuantWrapper<vpx_quantize_b_avx2>,
+                   &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
                     false),
-        make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_8, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_10, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_12, 16, false),
-        make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c,
-                   VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
+                   false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+                   32, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+                   32, false)));
  #else
  INSTANTIATE_TEST_SUITE_P(
      AVX2, VP9QuantizeTest,
@@ -602,11 +644,12 @@ INSTANTIATE_TEST_SUITE_P(
                        make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>,
                                   &QuantFPWrapper<quantize_fp_32x32_nz_c>,
                                   VPX_BITS_8, 32, true),
-                      make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_avx2,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false)));
+                      make_tuple(&QuantWrapper<vpx_quantize_b_avx2>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
  #endif  // CONFIG_VP9_HIGHBITDEPTH
  #endif  // HAVE_AVX2
  
@@ -615,22 +658,29 @@ INSTANTIATE_TEST_SUITE_P(
  INSTANTIATE_TEST_SUITE_P(
      NEON, VP9QuantizeTest,
      ::testing::Values(
-        make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16,
+        make_tuple(&QuantWrapper<vpx_quantize_b_neon>,
+                   &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_10, 16,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_c>, VPX_BITS_12, 16,
                     false),
-        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_8, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_10, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_12, 16, false),
-        make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c,
-                   VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_10,
+                   32, false),
+        make_tuple(&QuantWrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &QuantWrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_12,
+                   32, false),
          make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
                     &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
          make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
@@ -639,11 +689,12 @@ INSTANTIATE_TEST_SUITE_P(
  #else
  INSTANTIATE_TEST_SUITE_P(
      NEON, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_neon,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false),
+    ::testing::Values(make_tuple(&QuantWrapper<vpx_quantize_b_neon>,
+                                 &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8,
+                                 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false),
                        make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
                                   &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
                                   16, true),
@@ -683,9 +734,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest,
  INSTANTIATE_TEST_SUITE_P(
      DISABLED_C, VP9QuantizeTest,
      ::testing::Values(
-        make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false),
-        make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8,
-                   32, false),
+        make_tuple(&QuantWrapper<vpx_quantize_b_c>,
+                   &QuantWrapper<vpx_quantize_b_c>, VPX_BITS_8, 16, false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
          make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>,
                     &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
          make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h

index 178695291178a542a72b6fdb16f5ea4290de3376..fc27a0fbdaaf7c89a925746bd6fc54c3658d0688 100644 (file)
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -13,6 +13,7 @@
  
  #include "vpx_util/vpx_thread.h"
  
+#include "vp9/common/vp9_blockd.h"
  #include "vp9/common/vp9_entropymv.h"
  #include "vp9/common/vp9_entropy.h"
  
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c

index fa222f9dcf97defbd76d322f5fda3f135732145b..4910dc20f5a0947107c2bb5006058c6e712a0286 100644 (file)
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -542,8 +542,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
    switch (tx_size) {
      case TX_32X32:
        fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
-                           p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+      vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
                             scan_order->scan, scan_order->iscan);
        break;
      case TX_16X16:
@@ -948,8 +947,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
          vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst,
                             dst_stride);
          fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
-                             p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+        vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
                               scan_order->scan, scan_order->iscan);
        }
        if (args->enable_coeff_opt && !x->skip_recode) {
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c

index 9c227d560faf834250bfd0893c958a89804db5e8..e81738a7bb3d0e502fbdb9c02078d16f89ceefd7 100644 (file)
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -14,6 +14,7 @@
  #include "./vpx_config.h"
  #include "./vpx_dsp_rtcd.h"
  #include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/encoder/vp9_block.h"
  
  static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
                                                 const int16x8_t dequant,
@@ -213,11 +214,8 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
  
  // Main difference is that zbin values are halved before comparison and dqcoeff
  // values are divided by 2. zbin is rounded but dqcoeff is not.
-void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               const int16_t *zbin_ptr,
-                               const int16_t *round_ptr,
-                               const int16_t *quant_ptr,
-                               const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
+                               const struct macroblock_plane *const mb_plane,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                 const int16_t *scan, const int16_t *iscan) {
@@ -226,10 +224,10 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    int i;
  
    // Only the first element of each vector is DC.
-  int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
-  int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
-  int16x8_t quant = vld1q_s16(quant_ptr);
-  int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+  int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1);
+  int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1);
+  int16x8_t quant = vld1q_s16(mb_plane->quant);
+  int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
    int16x8_t dequant = vld1q_s16(dequant_ptr);
  
    // Process first 8 values which include a dc component.
@@ -289,6 +287,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
  #endif  // __aarch64__
    // Need these here, else the compiler complains about mixing declarations and
    // code in C90
-  (void)n_coeffs;
    (void)scan;
  }
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c

index 5d6ba64a8a5df40a857edc8f4964141201e1a50c..212db45c88fc1c513d568b00c297f4bd1671fc8a 100644 (file)
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -14,6 +14,7 @@
  #include "vpx_dsp/quantize.h"
  #include "vpx_dsp/vpx_dsp_common.h"
  #include "vpx_mem/vpx_mem.h"
+#include "vp9/encoder/vp9_block.h"
  
  void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                       const int16_t *round_ptr, const int16_t quant,
@@ -208,19 +209,21 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
  }
  #endif
  
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            const int16_t *zbin_ptr, const int16_t *round_ptr,
-                            const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                            const struct macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+  const int n_coeffs = 32 * 32;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1),
+                         ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) };
    const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
  
    int idx = 0;
-  int idx_arr[1024];
+  int idx_arr[32 * 32 /* n_coeffs */];
    int i, eob = -1;
    (void)iscan;
  
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index eef72249e0be5a628f8442ffbde1ceab5dba33ba..639c18bc985d35562b1357958def828b6f89ad7b 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -17,6 +17,9 @@ print <<EOF
  #include "vpx/vpx_integer.h"
  #include "vpx_dsp/vpx_dsp_common.h"
  #include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+#endif
  
  EOF
  }
@@ -717,7 +720,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
    add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
    specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/;
  
-  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane * const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
    specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/;
  
    if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c

index 7d8352721605cd4361c2f5e2c027b554d0cc7d1e..d52f6c66440f8ee40216fe15ef352ffe5eada0d8 100644 (file)
--- a/vpx_dsp/x86/quantize_avx.c
+++ b/vpx_dsp/x86/quantize_avx.c
@@ -140,15 +140,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    *eob_ptr = accumulate_eob(eob);
  }
  
-void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                              const int16_t *zbin_ptr, const int16_t *round_ptr,
-                              const int16_t *quant_ptr,
-                              const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
+                              const struct macroblock_plane *const mb_plane,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
    const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
    const __m256i big_zero = _mm256_setzero_si256();
    int index;
  
@@ -160,26 +157,9 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    __m128i eob = zero, eob0;
  
    (void)scan;
-  (void)n_coeffs;
-
-  // Setup global values.
-  // The 32x32 halves zbin and round.
-  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-  // Shift with rounding.
-  zbin = _mm_add_epi16(zbin, one);
-  zbin = _mm_srli_epi16(zbin, 1);
-  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
-  // it is a strict "greater" comparison.
-  zbin = _mm_sub_epi16(zbin, one);
-
-  round = _mm_load_si128((const __m128i *)round_ptr);
-  round = _mm_add_epi16(round, one);
-  round = _mm_srli_epi16(round, 1);
-
-  quant = _mm_load_si128((const __m128i *)quant_ptr);
-  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
-  shift = _mm_slli_epi16(shift, 1);
+
+  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+                     &shift);
  
    // Do DC and first 15 AC.
    coeff0 = load_tran_low(coeff_ptr);
diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c

index 28f7c9c7da78b228815abf89f0b2d0843b18a8a3..a8412c5b8e3748c3b1fd1e5f1d58c8ac5b35a4fc 100644 (file)
--- a/vpx_dsp/x86/quantize_avx2.c
+++ b/vpx_dsp/x86/quantize_avx2.c
@@ -13,6 +13,7 @@
  
  #include "./vpx_dsp_rtcd.h"
  #include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
  
  static VPX_FORCE_INLINE void load_b_values_avx2(
      const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
@@ -250,23 +251,19 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
    }
  }
  
-void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               const int16_t *zbin_ptr,
-                               const int16_t *round_ptr,
-                               const int16_t *quant_ptr,
-                               const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
+                               const struct macroblock_plane *const mb_plane,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                 const int16_t *scan, const int16_t *iscan) {
    __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
    __m256i v_eobmax = _mm256_setzero_si256();
    intptr_t count;
-  (void)n_coeffs;
    (void)scan;
  
-  load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
-                     &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
-                     &v_quant_shift, 1);
+  load_b_values_avx2(mb_plane->zbin, &v_zbin, mb_plane->round, &v_round,
+                     mb_plane->quant, &v_quant, dequant_ptr, &v_dequant,
+                     mb_plane->quant_shift, &v_quant_shift, 1);
  
    // Do DC and first 15 AC.
    v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h

index 27bfb4e41beb6d97b3f1aa861d41fb8ed18c37e6..fe42fee018d64540c39e7137476ff939ee775ea8 100644 (file)
--- a/vpx_dsp/x86/quantize_sse2.h
+++ b/vpx_dsp/x86/quantize_sse2.h
@@ -15,6 +15,7 @@
  
  #include "./vpx_config.h"
  #include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
  
  static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
                                   const int16_t *round_ptr, __m128i *round,
@@ -29,6 +30,33 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
    *shift = _mm_load_si128((const __m128i *)shift_ptr);
  }
  
+static INLINE void load_b_values32x32(
+    const struct macroblock_plane *const mb_plane, __m128i *zbin,
+    __m128i *round, __m128i *quant, const int16_t *dequant_ptr,
+    __m128i *dequant, __m128i *shift) {
+  const __m128i one = _mm_set1_epi16(1);
+  // 32x32 blocks use half of zbin and round: load each, then halve it.
+  *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+  // Halve with rounding: (zbin + 1) >> 1 in each 16-bit lane.
+  *zbin = _mm_add_epi16(*zbin, one);
+  *zbin = _mm_srli_epi16(*zbin, 1);
+  // x86 lacks a "greater *or equal*" compare. Lowering zbin by 1 turns the
+  // threshold test into a strict "greater" comparison.
+  *zbin = _mm_sub_epi16(*zbin, one);
+
+  *round = _mm_load_si128((const __m128i *)mb_plane->round);
+  *round = _mm_add_epi16(*round, one);
+  *round = _mm_srli_epi16(*round, 1);
+
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+  // Suspected not technically OK: quant_shift can be up to 1 << 16, so the
+  // extra left shift below can overflow its 16-bit lane. The test is not
+  // comprehensive enough to catch that and "it's been that way forever".
+  *shift = _mm_slli_epi16(*shift, 1);
+}
+
  static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
                                    const int16_t *quant_ptr, __m128i *quant,
                                    const int16_t *dequant_ptr,
diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c

index 476230286d7890b67a5aa96a6456b56bbcaf715f..6fe54d7d98c678a74c93a667a78ebc18ee0981ff 100644 (file)
--- a/vpx_dsp/x86/quantize_ssse3.c
+++ b/vpx_dsp/x86/quantize_ssse3.c
@@ -16,6 +16,7 @@
  #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
  #include "vpx_dsp/x86/quantize_sse2.h"
  #include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/encoder/vp9_block.h"
  
  void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -107,16 +108,12 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    *eob_ptr = accumulate_eob(eob);
  }
  
-void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr,
+                                const struct macroblock_plane *const mb_plane,
                                  tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                  const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                  const int16_t *scan, const int16_t *iscan) {
    const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
    int index;
  
    __m128i zbin, round, quant, dequant, shift;
@@ -127,29 +124,9 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    __m128i eob = zero, eob0;
  
    (void)scan;
-  (void)n_coeffs;
-
-  // Setup global values.
-  // The 32x32 halves zbin and round.
-  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-  // Shift with rounding.
-  zbin = _mm_add_epi16(zbin, one);
-  zbin = _mm_srli_epi16(zbin, 1);
-  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
-  // it is a strict "greater" comparison.
-  zbin = _mm_sub_epi16(zbin, one);
-
-  round = _mm_load_si128((const __m128i *)round_ptr);
-  round = _mm_add_epi16(round, one);
-  round = _mm_srli_epi16(round, 1);
-
-  quant = _mm_load_si128((const __m128i *)quant_ptr);
-  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
-  // I suspect this is not technically OK because quant_shift can be up
-  // to 1 << 16 and shifting up again will outrange that, but the test is not
-  // comprehensive enough to catch that and "it's been that way forever"
-  shift = _mm_slli_epi16(shift, 1);
+
+  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+                     &shift);
  
    // Do DC and first 15 AC.
    coeff0 = load_tran_low(coeff_ptr);
author	Johann <johann@duck.com>
	Sat, 5 Nov 2022 00:53:07 +0000 (09:53 +0900)
committer	Johann <johann@duck.com>
	Tue, 28 Feb 2023 09:46:16 +0000 (18:46 +0900)
test/vp9_quantize_test.cc		patch \| blob \| history
vp9/encoder/vp9_block.h		patch \| blob \| history
vp9/encoder/vp9_encodemb.c		patch \| blob \| history
vpx_dsp/arm/quantize_neon.c		patch \| blob \| history
vpx_dsp/quantize.c		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history
vpx_dsp/x86/quantize_avx.c		patch \| blob \| history
vpx_dsp/x86/quantize_avx2.c		patch \| blob \| history
vpx_dsp/x86/quantize_sse2.h		patch \| blob \| history
vpx_dsp/x86/quantize_ssse3.c		patch \| blob \| history