x[15] = sub_dct_const_round_shift_low_8(s13, s15);
// stage 4
-  {
-    const int16x8_t sum = vaddq_s16(x[2], x[3]);
-    const int16x8_t sub = vsubq_s16(x[2], x[3]);
-    x[2] = iadst_half_butterfly_neg_neon(sum, c_16_n16_8_24);
-    x[3] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
-  }
-  {
-    const int16x8_t sum = vaddq_s16(x[7], x[6]);
-    const int16x8_t sub = vsubq_s16(x[7], x[6]);
-    x[6] = iadst_half_butterfly_pos_neon(sum, c_16_n16_8_24);
-    x[7] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
-  }
-  {
-    const int16x8_t sum = vaddq_s16(x[11], x[10]);
-    const int16x8_t sub = vsubq_s16(x[11], x[10]);
-    x[10] = iadst_half_butterfly_pos_neon(sum, c_16_n16_8_24);
-    x[11] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
-  }
-  {
-    const int16x8_t sum = vaddq_s16(x[14], x[15]);
-    const int16x8_t sub = vsubq_s16(x[14], x[15]);
-    x[14] = iadst_half_butterfly_neg_neon(sum, c_16_n16_8_24);
-    x[15] = iadst_half_butterfly_pos_neon(sub, c_16_n16_8_24);
-  }
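+  // Each call below folds a sum/difference pair of half butterflies into
+  // one helper call:
+  //   *x1 = round_shift(c * (x0 + x1))
+  //   *x0 = round_shift(c * (x0 - x1))
+  // where c is lane 1 (neg) or lane 0 (pos) of c_16_n16_8_24.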
+  iadst_half_butterfly_neg_neon(&x[3], &x[2], c_16_n16_8_24);
+  iadst_half_butterfly_pos_neon(&x[7], &x[6], c_16_n16_8_24);
+  iadst_half_butterfly_pos_neon(&x[11], &x[10], c_16_n16_8_24);
+  iadst_half_butterfly_neg_neon(&x[15], &x[14], c_16_n16_8_24);
out[0] = x[0];
out[1] = vnegq_s16(x[8]);
x[1] = dct_const_round_shift_low_8(t1);
}
-static INLINE int16x8_t iadst_half_butterfly_neg_neon(const int16x8_t in,
-                                                      const int16x4_t c) {
-  int32x4_t t[2];
+static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0,
+                                                 int16x8_t *const x1,
+                                                 const int16x4_t c) {
+  // Don't add/sub before the multiply: summing the 16-bit inputs first can
+  // overflow in iadst8. Multiply (widening to 32 bits) first, then add/sub
+  // the 32-bit products.
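+  // Lane 1 of c holds the negated constant, hence the _neg_ suffix.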
+  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1);
+  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1);
+  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1);
+  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1);
+  int32x4_t t0[2], t1[2];
-  t[0] = vmull_lane_s16(vget_low_s16(in), c, 1);
-  t[1] = vmull_lane_s16(vget_high_s16(in), c, 1);
-  return dct_const_round_shift_low_8(t);
+  t0[0] = vaddq_s32(x0_lo, x1_lo);
+  t0[1] = vaddq_s32(x0_hi, x1_hi);
+  t1[0] = vsubq_s32(x0_lo, x1_lo);
+  t1[1] = vsubq_s32(x0_hi, x1_hi);
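+  // Note the outputs swap: the sum goes to *x1 and the difference to *x0,
+  // matching the operand order at the call sites.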
+  *x1 = dct_const_round_shift_low_8(t0);
+  *x0 = dct_const_round_shift_low_8(t1);
}
-static INLINE int16x8_t iadst_half_butterfly_pos_neon(const int16x8_t in,
-                                                      const int16x4_t c) {
-  int32x4_t t[2];
+static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0,
+                                                 int16x8_t *const x1,
+                                                 const int16x4_t c) {
+  // Don't add/sub before the multiply: summing the 16-bit inputs first can
+  // overflow in iadst8. Multiply (widening to 32 bits) first, then add/sub
+  // the 32-bit products.
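+  // Identical to iadst_half_butterfly_neg_neon() except the products use
+  // lane 0 of c, the non-negated constant.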
+  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0);
+  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0);
+  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0);
+  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0);
+  int32x4_t t0[2], t1[2];
-  t[0] = vmull_lane_s16(vget_low_s16(in), c, 0);
-  t[1] = vmull_lane_s16(vget_high_s16(in), c, 0);
-  return dct_const_round_shift_low_8(t);
+  t0[0] = vaddq_s32(x0_lo, x1_lo);
+  t0[1] = vaddq_s32(x0_hi, x1_hi);
+  t1[0] = vsubq_s32(x0_lo, x1_lo);
+  t1[1] = vsubq_s32(x0_hi, x1_hi);
+  *x1 = dct_const_round_shift_low_8(t0);
+  *x0 = dct_const_round_shift_low_8(t1);
}
static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0,