]> granicus.if.org Git - libvpx/commitdiff
Fix a bug in vp9_highbd_iht4x4_16_add_neon()
authorLinfeng Zhang <linfengz@google.com>
Tue, 13 Mar 2018 23:10:00 +0000 (16:10 -0700)
committerLinfeng Zhang <linfengz@google.com>
Tue, 13 Mar 2018 23:10:00 +0000 (16:10 -0700)
This bug was introduced in 36363304.

BUG=webm:1403

Change-Id: I695b409047e41ab7e0460981524310d78753751a

test/dct_test.cc
vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
vp9/common/vp9_rtcd_defs.pl

index 6ec8a874a0d5c4695c0240d204ff9a40e6d9dc85..bfa05e5d825037ede2bb0ed4403b16a69855f485 100644 (file)
@@ -629,11 +629,11 @@ INSTANTIATE_TEST_CASE_P(
 
 static const FuncInfo ht_neon_func_info[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
+  { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
+    2 },
 // TODO(linfengz): reenable these functions once test vector failures are
 // addressed.
 #if 0
-  { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
-    2 },
   { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8,
     2 },
 #endif
index 46284238d61a0f0acf9ba4aeedd463fac7390694..52c4f1937dfb1716b171dbb1555c16a950680f91 100644 (file)
 static INLINE void highbd_iadst4(int32x4_t *const io) {
   const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
   const int32x4_t sinpi = vld1q_s32(sinpis);
-  int32x4_t s[8];
-
-  s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0);
-  s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1);
-  s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0);
-  s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1);
-  s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0);
-  s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1);
-  s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1);
-  s[7] = vsubq_s32(io[0], io[2]);
-  s[7] = vaddq_s32(s[7], io[3]);
-
-  s[0] = vaddq_s32(s[0], s[3]);
-  s[0] = vaddq_s32(s[0], s[5]);
-  s[1] = vsubq_s32(s[1], s[4]);
-  s[1] = vsubq_s32(s[1], s[6]);
+  int64x2x2_t s[7], t[4];
+  int32x4_t s7;
+
+  s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1);
+  s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1);
+  s7 = vsubq_s32(io[0], io[2]);
+  s7 = vaddq_s32(s7, io[3]);
+
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]);
   s[3] = s[2];
-  s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0);
-
-  io[0] = vaddq_s32(s[0], s[3]);
-  io[1] = vaddq_s32(s[1], s[3]);
-  io[2] = s[2];
-  io[3] = vaddq_s32(s[0], s[1]);
-  io[3] = vsubq_s32(io[3], s[3]);
-  io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS);
-  io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS);
-  io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS);
-  io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS);
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0);
+
+  t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]);
+  t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]);
+  t[2] = s[2];
+  t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]);
+  t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]);
+  t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]);
+  t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]);
+  io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS));
+  io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS));
+  io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS));
+  io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS));
 }
 
 void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
index 23732e214eb13c5f81795ac8e0e5a218f2d82214..fcd8f7e62f6b18b8760046bcc3e360bc88463868 100644 (file)
@@ -103,7 +103,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
 
   if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
-    specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
+    specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
     specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
     specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
   }