From a7b2d09f36371e917db4ff877b56c4d2a39d4124 Mon Sep 17 00:00:00 2001 From: Peter de Rivaz Date: Thu, 16 Oct 2014 13:38:46 +0100 Subject: [PATCH] Added highbitdepth sse2 acceleration for quantize Also includes block error. (This patch is mostly cherry picked from commit db7192e0b014a331a1dcb102c8a1148e9f0e1081) Change-Id: Idef18f90b111a0d0c9546543d3347e551908fd78 --- test/test.mk | 2 + test/vp9_error_block_test.cc | 150 ++++++++ test/vp9_quantize_test.cc | 357 ++++++++++++++++++ vp9/common/vp9_rtcd_defs.pl | 6 +- .../x86/vp9_highbd_block_error_intrin_sse2.c | 71 ++++ .../x86/vp9_highbd_quantize_intrin_sse2.c | 182 +++++++++ vp9/vp9cx.mk | 2 + 7 files changed, 767 insertions(+), 3 deletions(-) create mode 100644 test/vp9_error_block_test.cc create mode 100644 test/vp9_quantize_test.cc create mode 100644 vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c create mode 100644 vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c diff --git a/test/test.mk b/test/test.mk index ab4ebbf18..c665ae262 100644 --- a/test/test.mk +++ b/test/test.mk @@ -137,6 +137,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc ifeq ($(CONFIG_VP9_ENCODER),yes) diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc new file mode 100644 index 000000000..b59d95ea8 --- /dev/null +++ b/test/vp9_error_block_test.cc @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>    // NOTE(review): these three header names were lost when
+#include <cstdlib>  // the patch text was extracted; they are restored from
+#include <string>   // libvpx test convention -- confirm before applying.
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+#if CONFIG_VP9_HIGHBITDEPTH
+const int kNumIterations = 1000;  // trials run by each test below
+
+typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
+                                  const tran_low_t *dqcoeff,
+                                  intptr_t block_size,
+                                  int64_t *ssz, int bps);
+
+typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
+    ErrorBlockParam;  // (function under test, reference function, bit depth)
+
+class ErrorBlockTest
+  : public ::testing::TestWithParam<ErrorBlockParam> {
+ public:
+  virtual ~ErrorBlockTest() {}
+  virtual void SetUp() {
+    error_block_op_ = GET_PARAM(0);
+    ref_error_block_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;          // forwarded to both functions as bps
+  ErrorBlockFunc error_block_op_;      // implementation under test
+  ErrorBlockFunc ref_error_block_op_;  // reference implementation
+};
+
+TEST_P(ErrorBlockTest, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
+  int err_count_total = 0;
+  int first_failure = -1;
+  intptr_t block_size;
+  int64_t ssz;
+  int64_t ret;
+  int64_t ref_ssz;
+  int64_t ref_ret;
+  for (int i = 0; i < kNumIterations; ++i) {
+    int err_count = 0;
+    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
+    for (int j = 0; j < block_size; j++) {
+      coeff[j] = rnd(2 << 20) - (1 << 20);
+      dqcoeff[j] = rnd(2 << 20) - (1 << 20);
+    }
+    ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
+                                  bit_depth_);
+    ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
+                                                   &ssz, bit_depth_));
+    err_count += (ref_ret != ret) | (ref_ssz != ssz);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(ErrorBlockTest, ExtremeValues) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
+  int err_count_total = 0;
+  int first_failure = -1;
+  intptr_t block_size;
+  int64_t ssz;
+  int64_t ret;
+  int64_t ref_ssz;
+  int64_t ref_ret;
+  int max_val = ((1 << 20) - 1);
+  for (int i = 0; i < kNumIterations; ++i) {
+    int err_count = 0;
+    int k = (i / 9) % 5;  // k < 4: +/-max_val sign combos, k == 4: random
+
+    // Change the maximum coeff value, to test different bit boundaries
+    if ( k == 4 && (i % 9) == 0 ) {
+      max_val >>= 1;
+    }
+    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
+    for (int j = 0; j < block_size; j++) {
+      if (k < 4) {  // Test at maximum values
+        coeff[j] = k % 2 ? max_val : -max_val;
+        dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
+      } else {
+        coeff[j] = rnd(2 << 14) - (1 << 14);
+        dqcoeff[j] = rnd(2 << 14) - (1 << 14);
+      }
+    }
+    ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
+                                  bit_depth_);
+    ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
+                                                   &ssz, bit_depth_));
+    err_count += (ref_ret != ret) | (ref_ssz != ssz);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, ErrorBlockTest,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_10),
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_12),
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_8)));
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
new file mode 100644
index 000000000..c30b82763
--- /dev/null
+++ b/test/vp9_quantize_test.cc
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>    // NOTE(review): these three header names were lost when
+#include <cstdlib>  // the patch text was extracted; they are restored from
+#include <string>   // libvpx test convention -- confirm before applying.
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+#if CONFIG_VP9_HIGHBITDEPTH
+const int number_of_iterations = 100;  // trials run by each test below
+
+typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
+                             int skip_block, const int16_t *zbin,
+                             const int16_t *round, const int16_t *quant,
+                             const int16_t *quant_shift,
+                             tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                             const int16_t *dequant, int zbin_oq_value,
+                             uint16_t *eob, const int16_t *scan,
+                             const int16_t *iscan);
+typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
+    QuantizeParam;  // (function under test, reference function, bit depth)
+
+class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  virtual ~VP9QuantizeTest() {}
+  virtual void SetUp() {
+    quantize_op_ = GET_PARAM(0);
+    ref_quantize_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;
+  int mask_;                      // limits random coeff/zbin to bit_depth_ bits
+  QuantizeFunc quantize_op_;      // implementation under test
+  QuantizeFunc ref_quantize_op_;  // reference implementation
+};
+
+class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  virtual ~VP9Quantize32Test() {}
+  virtual void SetUp() {
+    quantize_op_ = GET_PARAM(0);
+    ref_quantize_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;
+  int mask_;                      // limits random coeff/zbin to bit_depth_ bits
+  QuantizeFunc quantize_op_;      // implementation under test
+  QuantizeFunc ref_quantize_op_;  // reference implementation
+};
+
+TEST_P(VP9QuantizeTest, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int zbin_oq_value = 0;
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    const int skip_block = i == 0;
+    const TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8 TX_16X16
+    const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    const int count = (4 << sz) * (4 << sz);  // 16, 64, 256
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = rnd.Rand16()&mask_;
+    }
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16()&mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr,
+                                          zbin_oq_value, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+    for (int j = 0; j < count; ++j) {  // every coefficient, not just `sz`
+      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(VP9Quantize32Test, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int zbin_oq_value = 0;
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    const int skip_block = i == 0;
+    const TX_SIZE sz = TX_32X32;
+    const TX_TYPE tx_type = (TX_TYPE)(i % 4);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    const int count = (4 << sz) * (4 << sz);  // 1024
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = rnd.Rand16()&mask_;
+    }
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16()&mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr,
+                                          zbin_oq_value, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+    for (int j = 0; j < count; ++j) {  // every coefficient, not just `sz`
+      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(VP9QuantizeTest, EOBCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int zbin_oq_value = 0;
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    int skip_block = i == 0;
+    TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8 TX_16X16
+    TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    int count = (4 << sz) * (4 << sz);  // 16, 64, 256
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    // Zero the block, then set two random entries
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = 0;
+    }
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16()&mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr,
+                                          zbin_oq_value, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+
+    for (int j = 0; j < count; ++j) {  // every coefficient, not just `sz`
+      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(VP9Quantize32Test, EOBCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int zbin_oq_value = 0;
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    int skip_block = i == 0;
+    TX_SIZE sz = TX_32X32;
+    TX_TYPE tx_type = (TX_TYPE)(i % 4);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    int count = (4 << sz) * (4 << sz);  // 1024
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = 0;
+    }
+    // Two random entries
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16()&mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr,
+                                          zbin_oq_value, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+
+    for (int j = 0; j < count; ++j) {  // every coefficient, not just `sz`
+      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_quantize_b_sse2,
+                   &vp9_highbd_quantize_b_c, VPX_BITS_8),
+        make_tuple(&vp9_highbd_quantize_b_sse2,
+                   &vp9_highbd_quantize_b_c, VPX_BITS_10),
+        make_tuple(&vp9_highbd_quantize_b_sse2,
+                   &vp9_highbd_quantize_b_c, VPX_BITS_12)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9Quantize32Test,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
+                   &vp9_highbd_quantize_b_32x32_c, VPX_BITS_8),
+        make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
+                   &vp9_highbd_quantize_b_32x32_c, VPX_BITS_10),
+        make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
+                   &vp9_highbd_quantize_b_32x32_c, VPX_BITS_12)));
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 968c8db79..273a09516 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1855,7 +1855,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # ENCODEMB INVOKE
 
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/vp9_highbd_block_error/;
+  specialize qw/vp9_highbd_block_error sse2/;
 
   add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
   specialize qw/vp9_highbd_subtract_block/;
@@ -1867,10 +1867,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_highbd_quantize_fp_32x32/;
 
   add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_b/;
+  specialize qw/vp9_highbd_quantize_b sse2/;
 
   add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_b_32x32/;
+  specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
 
   #
   # Structured Similarity (SSIM)
diff --git a/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
new file mode 100644
index 000000000..c245ccafa
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>     // NOTE(review): both header names were lost in
+#include <emmintrin.h>  // extraction; restored from usage (assert, SSE2).
+
+#include "vp9/common/vp9_common.h"
+
+int64_t vp9_highbd_block_error_sse2(
+    const tran_low_t *coeff, const tran_low_t *dqcoeff,  // 16-byte aligned
+    intptr_t block_size, int64_t *ssz, int bps) {
+  int i, j, test;
+  uint32_t temp[4];  // spill buffer for the four 32-bit lane sums
+  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);  // 0 when bps == 8; applied at the end
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i+=8) {  // 8 coefficients per iteration
+    // Load the data into xmm registers (aligned loads)
+    __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i));
+    __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4));
+    __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i));
+    __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4));
+    // Check if any values require more than 15 bit
+    max = _mm_set1_epi32(0x3fff);
+    min = _mm_set1_epi32(0xffffc000);
+    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+                         _mm_cmplt_epi32(mm_coeff, min));
+    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+                         _mm_cmplt_epi32(mm_coeff2, min));
+    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+                         _mm_cmplt_epi32(mm_dqcoeff, min));
+    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+                         _mm_cmplt_epi32(mm_dqcoeff2, min));
+    test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
+                                          _mm_or_si128(cmp2, cmp3)));
+
+    if (!test) {  // all 16 values in range: packed 16-bit path
+      __m128i mm_diff, error_sse2, sqcoeff_sse2;
+      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+      _mm_storeu_si128((__m128i*)temp, error_sse2);
+      error = error + temp[0] + temp[1] + temp[2] + temp[3];
+      _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
+      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+    } else {  // some value out of range: scalar 64-bit path
+      for (j = 0; j < 8; j++) {
+        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+        error += diff * diff;
+        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+      }
+    }
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;  // out: rounded, shifted sum of coeff^2
+  return error;    // rounded, shifted sum of (coeff - dqcoeff)^2
+}
diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
new file mode 100644
index 000000000..55c6ed71f
--- 
/dev/null
+++ b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // NOTE(review): name lost in extraction; SSE2 usage
+
+#include "vp9/common/vp9_common.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// from vp9_idct.h: typedef int32_t tran_low_t;
+void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,  // in, aligned
+                                intptr_t count,  // number of coefficients
+                                int skip_block,  // nonzero: only zero outputs
+                                const int16_t *zbin_ptr,  // [0]: k==0, [1]: rest
+                                const int16_t *round_ptr,  // indexed [k != 0]
+                                const int16_t *quant_ptr,  // indexed [k != 0]
+                                const int16_t *quant_shift_ptr,  // [k != 0]
+                                tran_low_t *qcoeff_ptr,  // out: quantized
+                                tran_low_t *dqcoeff_ptr,  // out: qcoeff*dequant
+                                const int16_t *dequant_ptr,  // indexed [k != 0]
+                                int zbin_oq_value,  // added to both zbins
+                                uint16_t *eob_ptr,  // out: max iscan[k] + 1
+                                const int16_t *scan,  // unused
+                                const int16_t *iscan) {  // read for eob only
+  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+  __m128i zbins[2];   // [0]: register 0 (lane 0 uses zbin_ptr[0]), [1]: others
+  __m128i nzbins[2];  // negated zbins
+
+  zbins[0] = _mm_set_epi32((int)(zbin_ptr[1] + zbin_oq_value),
+                           (int)(zbin_ptr[1] + zbin_oq_value),
+                           (int)(zbin_ptr[1] + zbin_oq_value),
+                           (int)(zbin_ptr[0] + zbin_oq_value));
+  zbins[1] = _mm_set1_epi32((int)(zbin_ptr[1] + zbin_oq_value));
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  (void)scan;
+
+  vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: drop trailing registers with all 4 lanes inside +/-zbin
+    for (i = ((int)count / 4) - 1; i >= 0; i--) {
+      __m128i coeffs, cmp1, cmp2;
+      int test;
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+      cmp1 = _mm_and_si128(cmp1, cmp2);
+      test = _mm_movemask_epi8(cmp1);
+      if (test == 0xffff)
+        non_zero_regs--;
+      else
+        break;
+    }
+
+    // Quantization pass: remaining registers, lanes with |coeff| >= zbin
+    for (i = 0; i < non_zero_regs; i++) {
+      __m128i coeffs, coeffs_sign, tmp1, tmp2;
+      int test;
+      int abs_coeff[4];
+      int coeff_sign[4];
+
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      coeffs_sign = _mm_srai_epi32(coeffs, 31);
+      coeffs = _mm_sub_epi32(
+          _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+      tmp1 = _mm_or_si128(tmp1, tmp2);
+      test = _mm_movemask_epi8(tmp1);
+      _mm_storeu_si128((__m128i*)abs_coeff, coeffs);
+      _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);
+
+      for (j = 0; j < 4; j++) {
+        if (test & (1 << (4 * j))) {  // movemask yields 4 bits per lane
+          int k = 4 * i + j;
+          int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],
+                              INT32_MIN, INT32_MAX);
+          tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *
+                 quant_shift_ptr[k != 0]) >> 16;  // quantization
+          qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];
+          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+          if (tmp)
+            eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+        }
+      }
+    }
+  }
+  *eob_ptr = eob_i + 1;
+}
+
+
+void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,  // aligned
+                                      intptr_t n_coeffs,  // coefficient count
+                                      int skip_block,  // nonzero: zero outputs
+                                      const int16_t *zbin_ptr,
+                                      const int16_t *round_ptr,
+                                      const int16_t *quant_ptr,
+                                      const int16_t *quant_shift_ptr,
+                                      tran_low_t *qcoeff_ptr,  // out
+                                      tran_low_t *dqcoeff_ptr,  // out
+                                      const int16_t *dequant_ptr,
+                                      int zbin_oq_value,
+                                      uint16_t *eob_ptr,  // out
+                                      const int16_t *scan,  // unused
+                                      const int16_t *iscan) {
+  __m128i zbins[2];
+  __m128i nzbins[2];
+  int idx = 0;
+  int idx_arr[1024];  // holds at most 1024 indices: n_coeffs must be <= 1024
+  int i, eob = -1;
+  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
+  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
+  (void)scan;
+  zbins[0] = _mm_set_epi32((zbin1_tmp + zbin_oq_value),   // NOTE(review):
+                           (zbin1_tmp + zbin_oq_value),   // zbin_oq_value is
+                           (zbin1_tmp + zbin_oq_value),   // added a 2nd time
+                           (zbin0_tmp + zbin_oq_value));  // here -- intended?
+  zbins[1] = _mm_set1_epi32((zbin1_tmp + zbin_oq_value));
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: record indices of lanes NOT strictly inside +/-zbin
+    for (i = 0; i < n_coeffs / 4; i++) {
+      __m128i coeffs, cmp1, cmp2;
+      int test;
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+      cmp1 = _mm_and_si128(cmp1, cmp2);
+      test = _mm_movemask_epi8(cmp1);
+      if (!(test & 0xf))
+        idx_arr[idx++] = i * 4;
+      if (!(test & 0xf0))
+        idx_arr[idx++] = i * 4 + 1;
+      if (!(test & 0xf00))
+        idx_arr[idx++] = i * 4 + 2;
+      if (!(test & 0xf000))
+        idx_arr[idx++] = i * 4 + 3;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = idx_arr[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      int64_t tmp = clamp(abs_coeff +
+                          ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                          INT32_MIN, INT32_MAX);
+      tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+             quant_shift_ptr[rc != 0]) >> 15;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+      if (tmp)
+        eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 651b4c168..67a9fa1a7 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -105,6 +105,8 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
-- 
2.40.0