From 88dc0d606255a5cba721c40b078b5802e891df57 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Tue, 13 Mar 2018 16:10:00 -0700 Subject: [PATCH] Fix a bug in vp9_highbd_iht4x4_16_add_neon() This bug was introduced in 36363304. BUG=webm:1403 Change-Id: I695b409047e41ab7e0460981524310d78753751a --- test/dct_test.cc | 4 +- .../arm/neon/vp9_highbd_iht4x4_add_neon.c | 75 ++++++++++++------- vp9/common/vp9_rtcd_defs.pl | 2 +- 3 files changed, 51 insertions(+), 30 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index 6ec8a874a..bfa05e5d8 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -629,11 +629,11 @@ INSTANTIATE_TEST_CASE_P( static const FuncInfo ht_neon_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, + 2 }, // TODO(linfengz): reenable these functions once test vector failures are // addressed. #if 0 - { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, - 2 }, { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, 2 }, #endif diff --git a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c index 46284238d..52c4f1937 100644 --- a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c +++ b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c @@ -23,34 +23,55 @@ static INLINE void highbd_iadst4(int32x4_t *const io) { const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 }; const int32x4_t sinpi = vld1q_s32(sinpis); - int32x4_t s[8]; - - s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0); - s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1); - s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0); - s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1); - s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0); - s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1); - s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1); - s[7] = vsubq_s32(io[0], io[2]); - s[7] = vaddq_s32(s[7], io[3]); - - s[0] = vaddq_s32(s[0], s[3]); - s[0] = vaddq_s32(s[0], s[5]); - s[1] = vsubq_s32(s[1], s[4]); - s[1] = vsubq_s32(s[1], s[6]); + int64x2x2_t s[7], t[4]; + int32x4_t s7; + + s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0); + s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0); + s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1); + s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1); + s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0); + s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1); + s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1); + s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0); + s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0); + s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1); + s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1); + s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1); + s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1); + s7 = vsubq_s32(io[0], io[2]); + s7 = vaddq_s32(s7, io[3]); + + s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]); + s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]); + s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]); + s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]); + s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]); + s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]); s[3] = s[2]; - s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0); - - io[0] = vaddq_s32(s[0], s[3]); - io[1] = vaddq_s32(s[1], s[3]); - io[2] = s[2]; - io[3] = vaddq_s32(s[0], s[1]); - io[3] = vsubq_s32(io[3], s[3]); - io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS); - io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS); - io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS); - io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS); + s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0); + + t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]); + t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]); + t[2] = s[2]; + t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]); + t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]); + t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]); + t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]); + io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS)); + io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS)); + io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS)); + io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS)); } void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 23732e214..fcd8f7e62 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -103,7 +103,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { - specialize qw/vp9_highbd_iht4x4_16_add sse4_1/; + specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/; specialize qw/vp9_highbd_iht8x8_64_add sse4_1/; specialize qw/vp9_highbd_iht16x16_256_add sse4_1/; } -- 2.40.0