Use shifted value for sinpi8sqrt2

author Johann <johannkoenig@google.com>

Tue, 2 Aug 2016 22:59:35 +0000 (15:59 -0700)

committer Johann <johannkoenig@google.com>

Sat, 24 Sep 2016 00:04:18 +0000 (17:04 -0700)
author Johann <johannkoenig@google.com>
Tue, 2 Aug 2016 22:59:35 +0000 (15:59 -0700)
committer Johann <johannkoenig@google.com>
Sat, 24 Sep 2016 00:04:18 +0000 (17:04 -0700)
diff --git a/test/idct_test.cc b/test/idct_test.cc

index f54f2c005ac7584d4318d404566cb80d3432620c..700da77e3e2b0a800f35ea6727129e58261fdb1f 100644 (file)
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -115,6 +115,10 @@ TEST_P(IDCTTest, TestWithData) {
  }
  
  INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_neon));
+#endif
  #if HAVE_MMX
  INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
                          ::testing::Values(vp8_short_idct4x4llm_mmx));
diff --git a/vp8/common/arm/neon/dequant_idct_neon.c b/vp8/common/arm/neon/dequant_idct_neon.c

index ff5981eaaccc7463a1460f4d65e2e45881f5715f..753051c77d00ed75efa2ada9ea35732114563735 100644 (file)
--- a/vp8/common/arm/neon/dequant_idct_neon.c
+++ b/vp8/common/arm/neon/dequant_idct_neon.c
@@ -11,7 +11,11 @@
  #include <arm_neon.h>
  
  static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 35468;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
  
  void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
                                 int stride) {
@@ -60,10 +64,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
  
-  q3 = vshrq_n_s16(q3, 1);
    q4 = vshrq_n_s16(q4, 1);
  
-  q3 = vqaddq_s16(q3, q2);
    q4 = vqaddq_s16(q4, q2);
  
    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
@@ -90,10 +92,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
    d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
    d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
  
-  q3 = vshrq_n_s16(q3, 1);
    q4 = vshrq_n_s16(q4, 1);
  
-  q3 = vqaddq_s16(q3, q2);
    q4 = vqaddq_s16(q4, q2);
  
    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.c b/vp8/common/arm/neon/shortidct4x4llm_neon.c

index a36c0c1cacce2647575df42005ef23e295e44ff5..1adb1c3171fca3190b6ba6d4927f1069b2346b44 100644 (file)
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.c
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -11,7 +11,11 @@
  #include <arm_neon.h>
  
  static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 35468;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
  
  void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
                                 int pred_stride, unsigned char *dst_ptr,
@@ -40,10 +44,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
  
-  q3s16 = vshrq_n_s16(q3s16, 1);
    q4s16 = vshrq_n_s16(q4s16, 1);
  
-  q3s16 = vqaddq_s16(q3s16, q2s16);
    q4s16 = vqaddq_s16(q4s16, q2s16);
  
    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
@@ -71,10 +73,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
  
-  q3s16 = vshrq_n_s16(q3s16, 1);
    q4s16 = vshrq_n_s16(q4s16, 1);
  
-  q3s16 = vqaddq_s16(q3s16, q2s16);
    q4s16 = vqaddq_s16(q4s16, q2s16);
  
    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
author	Johann <johannkoenig@google.com>
	Tue, 2 Aug 2016 22:59:35 +0000 (15:59 -0700)
committer	Johann <johannkoenig@google.com>
	Sat, 24 Sep 2016 00:04:18 +0000 (17:04 -0700)
test/idct_test.cc		patch | blob | history
vp8/common/arm/neon/dequant_idct_neon.c		patch | blob | history
vp8/common/arm/neon/shortidct4x4llm_neon.c		patch | blob | history