From: Linfeng Zhang Date: Mon, 5 Mar 2018 23:16:49 +0000 (-0800) Subject: Fix a bug in vp9_iht8x8_64_add_neon() X-Git-Tag: v1.8.0~817^2~1 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c244a862341826713c4ad17b1f7c637e8395cdfc;p=libvpx Fix a bug in vp9_iht8x8_64_add_neon() This bug was introduced in b14b616d. BUG=webm:1403 Change-Id: I84b2733734982e52b66548850d61758c772b5494 --- diff --git a/test/dct_test.cc b/test/dct_test.cc index 2a6fccb67..bc1afbbe2 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -639,10 +639,10 @@ static const FuncInfo ht_neon_func_info[] = { #endif #endif { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, -// TODO(linfengz): reenable these functions once test vector failures are -// addressed. -#if 0 + // TODO(linfengz): reenable these functions once test vector failures are + // addressed. { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, +#if 0 { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } #endif }; diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 15033dbc1..d1b49dfa7 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -675,9 +675,8 @@ INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT, ::testing::Values(make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 0, VPX_BITS_8))); -// TODO(linfengz): reenable these functions once test vector failures are -// addressed. -#if 0 // !CONFIG_VP9_HIGHBITDEPTH + +#if !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( NEON, FwdTrans8x8HT, ::testing::Values( diff --git a/vp9/common/arm/neon/vp9_iht_neon.h b/vp9/common/arm/neon/vp9_iht_neon.h index e918ebc7f..965eff36b 100644 --- a/vp9/common/arm/neon/vp9_iht_neon.h +++ b/vp9/common/arm/neon/vp9_iht_neon.h @@ -59,14 +59,17 @@ static INLINE void iadst4(int16x8_t *const io) { static INLINE void iadst_half_butterfly_neon(int16x8_t *const x, const int16x4_t c) { - const int16x8_t sum = vaddq_s16(x[0], x[1]); - const int16x8_t sub = vsubq_s16(x[0], x[1]); + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); int32x4_t t0[2], t1[2]; - t0[0] = vmull_lane_s16(vget_low_s16(sum), c, 0); - t0[1] = vmull_lane_s16(vget_high_s16(sum), c, 0); - t1[0] = vmull_lane_s16(vget_low_s16(sub), c, 0); - t1[1] = vmull_lane_s16(vget_high_s16(sub), c, 0); + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); x[0] = dct_const_round_shift_low_8(t0); x[1] = dct_const_round_shift_low_8(t1); } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 274edfd44..2b15b661c 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -68,7 +68,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { # Note that there are more specializations appended when # CONFIG_VP9_HIGHBITDEPTH is off. specialize qw/vp9_iht4x4_16_add neon sse2/; - specialize qw/vp9_iht8x8_64_add sse2/; + specialize qw/vp9_iht8x8_64_add neon sse2/; specialize qw/vp9_iht16x16_256_add sse2/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones.