From f2b311f580e068fc12d78c3b6233e53af9338e7c Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Wed, 23 Mar 2016 10:59:32 -0700 Subject: [PATCH] Simplify rounding in vp10_[fwd/inv]_txfm[1/2]d_#x# Change-Id: I24ce46e157dc5b9c0d75000a1a48e9c136ed4ee1 --- test/vp10_fwd_txfm1d_test.cc | 13 +---------- vp10/common/vp10_txfm.h | 28 ++++++++--------------- vp10/common/x86/vp10_txfm1d_sse2.h | 36 ++++++++++-------------------- 3 files changed, 22 insertions(+), 55 deletions(-) diff --git a/test/vp10_fwd_txfm1d_test.cc b/test/vp10_fwd_txfm1d_test.cc index bcbc6178e..2d09e0d32 100644 --- a/test/vp10_fwd_txfm1d_test.cc +++ b/test/vp10_fwd_txfm1d_test.cc @@ -31,7 +31,7 @@ static int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; static int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; TEST(vp10_fwd_txfm1d, round_shift) { - EXPECT_EQ(round_shift(7, 1), 3); + EXPECT_EQ(round_shift(7, 1), 4); EXPECT_EQ(round_shift(-7, 1), -3); EXPECT_EQ(round_shift(7, 2), 2); @@ -46,17 +46,6 @@ TEST(vp10_fwd_txfm1d, get_max_bit) { EXPECT_EQ(max_bit, 3); } -TEST(vp10_fwd_txfm1d, half_btf) { - int32_t max = (1 << 15) - 1; - int32_t w0 = max; - int32_t in0 = max; - int32_t w1 = max; - int32_t in1 = max; - int32_t result_32 = half_btf(w0, in0, w1, in1, 0); - int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1; - EXPECT_EQ(result_32, result_64); -} - TEST(vp10_fwd_txfm1d, cospi_arr) { for (int i = 0; i < 7; i++) { for (int j = 0; j < 64; j++) { diff --git a/vp10/common/vp10_txfm.h b/vp10/common/vp10_txfm.h index ad7b38f4d..9944bdda4 100644 --- a/vp10/common/vp10_txfm.h +++ b/vp10/common/vp10_txfm.h @@ -81,23 +81,7 @@ static const int32_t cospi_arr[7][64] = 12785, 11204, 9616, 8022, 6424, 4821, 3216, 1608}}; static INLINE int32_t round_shift(int32_t value, int bit) { - // For value >= 0, - // there are twe version of rounding - // 1) (value + (1 << (bit - 1)) - 1) >> bit - // 2) (value + (1 << (bit - 1))) >> bit - // boath methods are mild unbiased - // however, the first version has slightly advantage because - // it rounds number toward zero. - // For value < 0, we also choose the version that rounds number - // toward zero. - if (bit > 0) { - if (value >= 0) - return (value + (1 << (bit - 1)) - 1) >> bit; - else - return ((value - (1 << (bit - 1))) >> bit) + 1; - } else { - return value << (-bit); - } + return (value + (1 << (bit - 1))) >> bit; } static INLINE void round_shift_array(int32_t *arr, int size, int bit) { @@ -105,8 +89,14 @@ static INLINE void round_shift_array(int32_t *arr, int size, int bit) { if (bit == 0) { return; } else { - for (i = 0; i < size; i++) { - arr[i] = round_shift(arr[i], bit); + if (bit > 0) { + for (i = 0; i < size; i++) { + arr[i] = round_shift(arr[i], bit); + } + } else { + for (i = 0; i < size; i++) { + arr[i] = arr[i] << (-bit); + } } } } diff --git a/vp10/common/x86/vp10_txfm1d_sse2.h b/vp10/common/x86/vp10_txfm1d_sse2.h index bc99327e5..fc25013d6 100644 --- a/vp10/common/x86/vp10_txfm1d_sse2.h +++ b/vp10/common/x86/vp10_txfm1d_sse2.h @@ -81,32 +81,20 @@ static INLINE void transpose_32(int txfm_size, const __m128i* input, } } -#define mullo_epi32(a, b) \ - ({ \ +#define mullo_epi32(a, b) \ + ({ \ __m128i tmp1 = _mm_mul_epu32(a, b); \ __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); \ - _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), \ - _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); \ + _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), \ + _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); \ }) -#define round_shift_32_simple_sse2(input, bit) \ - ({ \ - __m128i round = _mm_set1_epi32((1 << (bit - 1)) - 1); \ - __m128i tmp1 = _mm_add_epi32(input, round); \ - _mm_srai_epi32(tmp1, bit); \ - }) - -#define round_shift_32_sse2(vec, bit) \ - ({ \ - __m128i sign, tmp, round; \ - sign = _mm_srai_epi32(vec, 31); \ - tmp = _mm_add_epi32(vec, sign); \ - tmp = _mm_xor_si128(tmp, sign); \ - round = _mm_set1_epi32((1 << (bit - 1)) - 1); \ - tmp = _mm_add_epi32(tmp, round); \ - tmp = _mm_srli_epi32(tmp, bit); \ - tmp = _mm_xor_si128(tmp, sign); \ - _mm_sub_epi32(tmp, sign); \ +#define round_shift_32_sse2(vec, bit) \ + ({ \ + __m128i tmp, round; \ + round = _mm_set1_epi32(1 << (bit - 1)); \ + tmp = _mm_add_epi32(vec, round); \ + _mm_srai_epi32(tmp, bit); \ }) #define round_shift_array_32_sse2(input, output, size, bit) \ @@ -128,7 +116,7 @@ static INLINE void transpose_32(int txfm_size, const __m128i* input, // out1 = -in1*w0 + in0*w1 #define btf_32_sse2_type0(w0, w1, in0, in1, out0, out1, bit) \ ({ \ - __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ + __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ ww0 = _mm_set1_epi32(w0); \ ww1 = _mm_set1_epi32(w1); \ in0_w0 = mullo_epi32(in0, ww0); \ @@ -145,7 +133,7 @@ static INLINE void transpose_32(int txfm_size, const __m128i* input, // out1 = in1*w0 - in0*w1 #define btf_32_sse2_type1(w0, w1, in0, in1, out0, out1, bit) \ ({ \ - __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ + __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ ww0 = _mm_set1_epi32(w0); \ ww1 = _mm_set1_epi32(w1); \ in0_w0 = mullo_epi32(in0, ww0); \ -- 2.40.0