From f2b311f580e068fc12d78c3b6233e53af9338e7c Mon Sep 17 00:00:00 2001
From: Angie Chiang <angiebird@google.com>
Date: Wed, 23 Mar 2016 10:59:32 -0700
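Subject: [PATCH] Simplify rounding in vp10_[fwd/inv]_txfm[1/2]d_#x#

round_shift() now always computes (value + (1 << (bit - 1))) >> bit
instead of choosing a sign-dependent formula, so ties round toward
positive infinity rather than toward zero (e.g. round_shift(7, 1) is
now 4 instead of 3). round_shift() no longer handles bit <= 0; the
left shift for negative bit moves into round_shift_array().

The SSE2 macro round_shift_32_sse2() is changed to use the same
rounding as the scalar version, and round_shift_32_simple_sse2() is
removed. The half_btf unit test is removed.
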
Change-Id: I24ce46e157dc5b9c0d75000a1a48e9c136ed4ee1
---
 test/vp10_fwd_txfm1d_test.cc       | 13 +----------
 vp10/common/vp10_txfm.h            | 28 ++++++++---------------
 vp10/common/x86/vp10_txfm1d_sse2.h | 36 ++++++++++--------------------
 3 files changed, 22 insertions(+), 55 deletions(-)

diff --git a/test/vp10_fwd_txfm1d_test.cc b/test/vp10_fwd_txfm1d_test.cc
index bcbc6178e..2d09e0d32 100644
--- a/test/vp10_fwd_txfm1d_test.cc
+++ b/test/vp10_fwd_txfm1d_test.cc
@@ -31,7 +31,7 @@ static int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
 static int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
 
 TEST(vp10_fwd_txfm1d, round_shift) {
-  EXPECT_EQ(round_shift(7, 1), 3);
+  EXPECT_EQ(round_shift(7, 1), 4);
   EXPECT_EQ(round_shift(-7, 1), -3);
 
   EXPECT_EQ(round_shift(7, 2), 2);
@@ -46,17 +46,6 @@ TEST(vp10_fwd_txfm1d, get_max_bit) {
   EXPECT_EQ(max_bit, 3);
 }
 
-TEST(vp10_fwd_txfm1d, half_btf) {
-  int32_t max = (1 << 15) - 1;
-  int32_t w0 = max;
-  int32_t in0 = max;
-  int32_t w1 = max;
-  int32_t in1 = max;
-  int32_t result_32 = half_btf(w0, in0, w1, in1, 0);
-  int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1;
-  EXPECT_EQ(result_32, result_64);
-}
-
 TEST(vp10_fwd_txfm1d, cospi_arr) {
   for (int i = 0; i < 7; i++) {
     for (int j = 0; j < 64; j++) {
diff --git a/vp10/common/vp10_txfm.h b/vp10/common/vp10_txfm.h
index ad7b38f4d..9944bdda4 100644
--- a/vp10/common/vp10_txfm.h
+++ b/vp10/common/vp10_txfm.h
@@ -81,23 +81,7 @@ static const int32_t cospi_arr[7][64] =
     12785, 11204,  9616,  8022,  6424,  4821,  3216,  1608}};
 
 static INLINE int32_t round_shift(int32_t value, int bit) {
-  // For value >= 0,
-  // there are twe version of rounding
-  // 1) (value + (1 << (bit - 1)) - 1) >> bit
-  // 2) (value + (1 << (bit - 1))) >> bit
-  // boath methods are mild unbiased
-  // however, the first version has slightly advantage because
-  // it rounds number toward zero.
-  // For value < 0, we also choose the version that rounds number
-  // toward zero.
-  if (bit > 0) {
-    if (value >= 0)
-      return (value + (1 << (bit - 1)) - 1) >> bit;
-    else
-      return ((value - (1 << (bit - 1))) >> bit) + 1;
-  } else {
-    return value << (-bit);
-  }
+  return (value + (1 << (bit - 1))) >> bit;
 }
 
 static INLINE void round_shift_array(int32_t *arr, int size, int bit) {
@@ -105,8 +89,14 @@ static INLINE void round_shift_array(int32_t *arr, int size, int bit) {
   if (bit == 0) {
     return;
   } else {
-    for (i = 0; i < size; i++) {
-      arr[i] = round_shift(arr[i], bit);
+    if (bit > 0) {
+      for (i = 0; i < size; i++) {
+        arr[i] = round_shift(arr[i], bit);
+      }
+    } else {
+      for (i = 0; i < size; i++) {
+        arr[i] = arr[i] << (-bit);
+      }
     }
   }
 }
diff --git a/vp10/common/x86/vp10_txfm1d_sse2.h b/vp10/common/x86/vp10_txfm1d_sse2.h
index bc99327e5..fc25013d6 100644
--- a/vp10/common/x86/vp10_txfm1d_sse2.h
+++ b/vp10/common/x86/vp10_txfm1d_sse2.h
@@ -81,32 +81,20 @@ static INLINE void transpose_32(int txfm_size, const __m128i* input,
   }
 }
 
-#define mullo_epi32(a, b)                                                   \
-  ({                                                                        \
+#define mullo_epi32(a, b)                                                     \
+  ({                                                                          \
     __m128i tmp1 = _mm_mul_epu32(a, b);                                       \
     __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); \
-    _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)),    \
-                       _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));   \
+    _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)),      \
+                       _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));     \
   })
 
-#define round_shift_32_simple_sse2(input, bit)          \
-  ({                                                    \
-    __m128i round = _mm_set1_epi32((1 << (bit - 1)) - 1); \
-    __m128i tmp1 = _mm_add_epi32(input, round);           \
-    _mm_srai_epi32(tmp1, bit);                          \
-  })
-
-#define round_shift_32_sse2(vec, bit)             \
-  ({                                              \
-    __m128i sign, tmp, round;                       \
-    sign = _mm_srai_epi32(vec, 31);               \
-    tmp = _mm_add_epi32(vec, sign);               \
-    tmp = _mm_xor_si128(tmp, sign);               \
-    round = _mm_set1_epi32((1 << (bit - 1)) - 1); \
-    tmp = _mm_add_epi32(tmp, round);              \
-    tmp = _mm_srli_epi32(tmp, bit);               \
-    tmp = _mm_xor_si128(tmp, sign);               \
-    _mm_sub_epi32(tmp, sign);                     \
+#define round_shift_32_sse2(vec, bit)       \
+  ({                                        \
+    __m128i tmp, round;                     \
+    round = _mm_set1_epi32(1 << (bit - 1)); \
+    tmp = _mm_add_epi32(vec, round);        \
+    _mm_srai_epi32(tmp, bit);               \
   })
 
 #define round_shift_array_32_sse2(input, output, size, bit) \
@@ -128,7 +116,7 @@ static INLINE void transpose_32(int txfm_size, const __m128i* input,
 // out1 = -in1*w0 + in0*w1
 #define btf_32_sse2_type0(w0, w1, in0, in1, out0, out1, bit) \
   ({                                                         \
-    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
+    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;        \
     ww0 = _mm_set1_epi32(w0);                                \
     ww1 = _mm_set1_epi32(w1);                                \
     in0_w0 = mullo_epi32(in0, ww0);                          \
@@ -145,7 +133,7 @@ static INLINE void transpose_32(int txfm_size, const __m128i* input,
 // out1 = in1*w0 - in0*w1
 #define btf_32_sse2_type1(w0, w1, in0, in1, out0, out1, bit) \
   ({                                                         \
-    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
+    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;        \
     ww0 = _mm_set1_epi32(w0);                                \
     ww1 = _mm_set1_epi32(w1);                                \
     in0_w0 = mullo_epi32(in0, ww0);                          \
-- 
2.40.0