From e3f12b520fe2d963d4aecf18c30ebbb594d50899 Mon Sep 17 00:00:00 2001 From: Johann Date: Mon, 29 Apr 2019 13:05:30 -0700 Subject: [PATCH] vp8 quantize: use native abs/sign implementations ~4% improvement with a very rudimentary speed test Change-Id: Iad8868327e3276dbead783a79849295b0e4b135c --- test/quantize_test.cc | 17 +++++++++++++++-- vp8/encoder/x86/quantize_sse4.c | 25 ++++++++----------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/test/quantize_test.cc b/test/quantize_test.cc index 1415ce18e..a7497742c 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -13,9 +13,10 @@ #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" #include "./vp8_rtcd.h" +#include "./vpx_config.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" @@ -117,7 +118,8 @@ class QuantizeTestBase { }; class QuantizeTest : public QuantizeTestBase, - public ::testing::TestWithParam<VP8QuantizeParam> { + public ::testing::TestWithParam<VP8QuantizeParam>, + public AbstractBench { protected: virtual void SetUp() { SetupCompressor(); @@ -125,6 +127,10 @@ class QuantizeTest : public QuantizeTestBase, c_quant_ = GET_PARAM(1); } + virtual void Run() { + asm_quant_(&vp8_comp_->mb.block[0], &macroblockd_dst_->block[0]); + } + void RunComparison() { for (int i = 0; i < kNumBlocks; ++i) { ASM_REGISTER_STATE_CHECK( @@ -167,6 +173,13 @@ TEST_P(QuantizeTest, TestMultipleQ) { } } +TEST_P(QuantizeTest, DISABLED_Speed) { + FillCoeffRandom(); + + RunNTimes(10000000); + PrintMedian("vp8 quantize"); +} + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, QuantizeTest, diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c index 6f2c16349..13dd1abc3 100644 --- a/vp8/encoder/x86/quantize_sse4.c +++ b/vp8/encoder/x86/quantize_sse4.c @@ -11,8 +11,8 @@ #include <smmintrin.h> /* SSE4.1 */ #include "./vp8_rtcd.h" -#include "vp8/encoder/block.h" #include "vp8/common/entropy.h" /* 
vp8_default_inv_zig_zag */ +#include "vp8/encoder/block.h" #define SELECT_EOB(i, z, x, y, q) \ do { \ @@ -31,8 +31,7 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { char eob = 0; short *zbin_boost_ptr = b->zrun_zbin_boost; - __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, - dqcoeff1; + __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1; __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); @@ -53,15 +52,9 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); - /* Sign of z: z >> 15 */ - sz0 = _mm_srai_epi16(z0, 15); - sz1 = _mm_srai_epi16(z1, 15); - - /* x = abs(z): (z ^ sz) - sz */ - x0 = _mm_xor_si128(z0, sz0); - x1 = _mm_xor_si128(z1, sz1); - x0 = _mm_sub_epi16(x0, sz0); - x1 = _mm_sub_epi16(x1, sz1); + /* x = abs(z) */ + x0 = _mm_abs_epi16(z0); + x1 = _mm_abs_epi16(z1); /* zbin[] + zbin_extra */ zbin0 = _mm_add_epi16(zbin0, zbin_extra); @@ -89,11 +82,9 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { y0 = _mm_mulhi_epi16(y0, quant_shift0); y1 = _mm_mulhi_epi16(y1, quant_shift1); - /* Return the sign: (y ^ sz) - sz */ - y0 = _mm_xor_si128(y0, sz0); - y1 = _mm_xor_si128(y1, sz1); - y0 = _mm_sub_epi16(y0, sz0); - y1 = _mm_sub_epi16(y1, sz1); + /* Restore the sign. */ + y0 = _mm_sign_epi16(y0, z0); + y1 = _mm_sign_epi16(y1, z1); /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0); -- 2.50.1