From: Johann Koenig
Date: Wed, 10 May 2017 18:19:50 +0000 (+0000)
Subject: Merge changes I92eb4312,Ibb2afe4e
X-Git-Tag: v1.7.0~482
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d713ec3c46c5c8e57b2c8d3fe36623b48c863189;hp=2346a6da4a3703eb2cb346f3b4a8e6d8a25c70f6;p=libvpx

Merge changes I92eb4312,Ibb2afe4e

* changes:
  subpel variance neon: add mixed sizes
  sub pixel variance neon: use generic variance
---
diff --git a/test/datarate_test.cc b/test/datarate_test.cc index dff75357e..c1a925f7b 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -1041,7 +1041,7 @@ TEST_P(DatarateTestVP9LargeDenoiser, 4threads) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.28) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29) << " The datarate for the file is greater than target by too much!"; } diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index f9745ed81..6ea77fde2 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -255,11 +255,11 @@ void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, #if CONFIG_VP9_HIGHBITDEPTH void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_c(in, out, stride, 10); + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_c(in, out, stride, 12); + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride, @@ -273,36 +273,36 @@ void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride, } void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12); + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } #if HAVE_SSE2 void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_c(in, out, stride, 10); + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_c(in, out, stride, 12); + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10); + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12); + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10); + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12); + vpx_highbd_idct16x16_10_add_sse2(in,
CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH @@ -353,7 +353,7 @@ class Trans16x16TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -475,10 +475,10 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_)); #if CONFIG_VP9_HIGHBITDEPTH } else { - inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_, + inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_, tx_type_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(output_ref_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } if (bit_depth_ == VPX_BITS_8) { @@ -530,8 +530,7 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), 16)); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16)); #endif // CONFIG_VP9_HIGHBITDEPTH } @@ -585,9 +584,9 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); } else { #if CONFIG_VP9_HIGHBITDEPTH - ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_); + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif // CONFIG_VP9_HIGHBITDEPTH } diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index a168e690e..d8054c4eb 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -71,11 +71,11 @@ typedef std::tr1::tuple #if CONFIG_VP9_HIGHBITDEPTH void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10); + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12); + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -137,7 +137,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - inv_txfm_(test_temp_block, CONVERT_TO_BYTEPTR(dst16), 32)); + inv_txfm_(test_temp_block, CAST_TO_BYTEPTR(dst16), 32)); #endif } @@ -275,7 +275,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) { ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32)); + ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CAST_TO_BYTEPTR(dst16), 32)); #endif } for (int j = 0; j < kNumCoeffs; ++j) { diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index 444b0209d..bd2327520 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -55,36 +55,36 @@ void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride, #if CONFIG_VP9_HIGHBITDEPTH void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, out, stride, 10); + vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, out, stride, 12); + vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int 
tx_type) { - vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12); + vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10); + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12); + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } #if HAVE_SSE2 void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10); + vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12); + vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH @@ -135,7 +135,7 @@ class Trans4x4TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -249,7 +249,7 @@ class Trans4x4TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 7c2df67f6..dfbb5dc3d 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -88,45 +88,45 @@ void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { #if CONFIG_VP9_HIGHBITDEPTH void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_c(in, out, stride, 10); + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_c(in, out, stride, 12); + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12); + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } #if HAVE_SSE2 void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_c(in, out, stride, 10); + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_c(in, out, stride, 12); + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10); + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12); + 
vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10); + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12); + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH @@ -257,7 +257,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -340,7 +340,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -413,7 +413,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -497,9 +497,9 @@ class FwdTrans8x8TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_); + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -671,16 +671,11 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_neon, - &vpx_idct8x8_64_add_neon, - 0, VPX_BITS_8))); -#else // !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT, ::testing::Values(make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 0, VPX_BITS_8))); +#if !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( NEON, FwdTrans8x8HT, ::testing::Values( @@ -688,8 +683,8 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT, diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 0759cd41c..2dda7da45 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -43,9 +43,11 @@ void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { } #if CONFIG_VP9_HIGHBITDEPTH -template <InvTxfmFunc fn> +typedef void (*InvTxfmHighbdFunc)(const tran_low_t *in, uint16_t *out, + int stride, int bd); +template <InvTxfmHighbdFunc fn> void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { - fn(in, CONVERT_TO_BYTEPTR(out), stride, bd); + fn(in, CAST_TO_SHORTPTR(out), stride, bd); } #endif @@ -452,31 +454,31 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = { &highbd_wrapper, TX_32X32, 1024, 12, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, +
&vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 135, 8, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 135, 10, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 135, 12, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 34, 8, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 34, 10, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 34, 12, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 1, 8, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 1, 10, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 1, 12, 2), make_tuple( &vpx_highbd_fdct16x16_c, &highbd_wrapper, @@ -488,31 +490,31 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = { &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 256, 12, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 38, 8, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 38, 10, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 38, 12, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 8, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 10, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 12, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 1, 8, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 1, 10, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 1, 12, 2), make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, @@ -556,29 +558,29 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 135, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 256, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 38, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 10, 8, 1), - make_tuple(&vpx_fdct16x16_c, 
&wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 1, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 64, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 12, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 1, 8, 1), make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 16, 8, 1), - make_tuple(&vpx_fdct4x4_c, &wrapper, + make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 1, 8, 1) }; @@ -591,13 +593,13 @@ INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest, const PartialInvTxfmParam sse2_partial_idct_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 1, 8, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 1, 10, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 1, 12, 2), make_tuple( &vpx_highbd_fdct16x16_c, &highbd_wrapper, @@ -609,13 +611,13 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 256, 12, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 8, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 10, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 12, 2), make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, @@ -627,13 +629,13 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { &vpx_highbd_fdct8x8_c, &highbd_wrapper, &highbd_wrapper, TX_8X8, 64, 12, 2), make_tuple(&vpx_highbd_fdct8x8_c, - &highbd_wrapper, + &highbd_wrapper, &highbd_wrapper, TX_8X8, 12, 8, 2), make_tuple( - &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &vpx_highbd_fdct8x8_c, &highbd_wrapper, &highbd_wrapper, TX_8X8, 12, 10, 2), make_tuple( - &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &vpx_highbd_fdct8x8_c, &highbd_wrapper, &highbd_wrapper, TX_8X8, 12, 12, 2), make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, @@ -647,27 +649,25 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, - &wrapper, TX_32X32, 135, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 256, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 10, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 1, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 64, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 12, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 1, 8, 1), make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 16, 8, 1), - make_tuple(&vpx_fdct4x4_c, &wrapper, + make_tuple(&vpx_fdct4x4_c, 
&wrapper, &wrapper, TX_4X4, 1, 8, 1) }; @@ -680,13 +680,13 @@ INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest, const PartialInvTxfmParam ssse3_partial_idct_tests[] = { make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 135, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 34, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 64, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 12, 8, 1) }; @@ -698,27 +698,25 @@ INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest, const PartialInvTxfmParam dspr2_partial_idct_tests[] = { make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, - &wrapper, TX_32X32, 135, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 256, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 10, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 1, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 64, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 12, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 1, 8, 1), make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 16, 8, 1), - make_tuple(&vpx_fdct4x4_c, &wrapper, + make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 1, 8, 1) }; @@ -731,27 +729,25 @@ INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest, const PartialInvTxfmParam msa_partial_idct_tests[] = { make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, - &wrapper, TX_32X32, 135, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 256, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 10, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 1, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 64, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 12, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 1, 8, 1), make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 16, 8, 1), - make_tuple(&vpx_fdct4x4_c, &wrapper, + make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 1, 8, 1) }; diff --git a/test/variance_test.cc b/test/variance_test.cc index 9eb9be3a1..57d57c972 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -22,6 +22,7 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" namespace { @@ -345,6 +346,7 @@ class MainTestClass void RefTest(); void RefStrideTest(); void OneQuarterTest(); + void SpeedTest(); 
// MSE/SSE tests void RefTestMse(); void RefTestSse(); void MaxTestMse(); void MaxTestSse(); @@ -363,6 +365,7 @@ class MainTestClass int byte_shift() const { return params_.bit_depth - 8; } int block_size() const { return params_.block_size; } int width() const { return params_.width; } + int height() const { return params_.height; } uint32_t mask() const { return params_.mask; } }; @@ -471,6 +474,35 @@ void MainTestClass<VarianceFunctionType>::OneQuarterTest() { EXPECT_EQ(expected, var); } +template <typename VarianceFunctionType> +void MainTestClass<VarianceFunctionType>::SpeedTest() { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 255, half); + memset(ref_ + half, 0, half); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse; + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < 100000000 / block_size(); ++i) { + const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse); + // Ignore return value. + (void)variance; + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("Variance %dx%d time: %5d ms\n", width(), height(), + elapsed_time / 1000); +} + //////////////////////////////////////////////////////////////////////////////// // Tests related to MSE / SSE. @@ -727,6 +759,7 @@ TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxVarianceTest, Ref) { RefTest(); } TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); } TEST_P(SumOfSquaresTest, Const) { ConstTest(); } TEST_P(SumOfSquaresTest, Ref) { RefTest(); } TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); } @@ -809,6 +842,7 @@ TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); } TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } @@ -1219,10 +1253,13 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(6, 5, &vpx_variance64x32_neon), VarianceParams(5, 6, &vpx_variance32x64_neon), VarianceParams(5, 5, &vpx_variance32x32_neon), + VarianceParams(5, 4, &vpx_variance32x16_neon), + VarianceParams(4, 5, &vpx_variance16x32_neon), VarianceParams(4, 4, &vpx_variance16x16_neon), VarianceParams(4, 3, &vpx_variance16x8_neon), VarianceParams(3, 4, &vpx_variance8x16_neon), - VarianceParams(3, 3, &vpx_variance8x8_neon))); + VarianceParams(3, 3, &vpx_variance8x8_neon), + VarianceParams(3, 2, &vpx_variance8x4_neon))); INSTANTIATE_TEST_CASE_P( NEON, VpxSubpelVarianceTest, diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc index 4a10e187b..56ca257c5 100644 --- a/test/vp9_denoiser_test.cc +++ b/test/vp9_denoiser_test.cc @@ -116,4 +116,19 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X32), make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X64))); #endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, VP9DenoiserTest, + ::testing::Values(make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X8), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X16), +
make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X8), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X64), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X64))); +#endif } // namespace diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 55957414c..69069042c 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -205,7 +205,7 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { const highbd_transform_2d IHT_4[] = { { vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0 @@ -213,7 +213,6 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, { vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2 { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3 }; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int i, j; tran_low_t out[4 * 4]; @@ -245,14 +244,13 @@ static const highbd_transform_2d HIGH_IHT_8[] = { { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3 }; -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; const highbd_transform_2d ht = HIGH_IHT_8[tx_type]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Inverse transform row vectors. 
for (i = 0; i < 8; ++i) { @@ -279,14 +277,13 @@ static const highbd_transform_2d HIGH_IHT_16[] = { { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3 }; -void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; const highbd_transform_2d ht = HIGH_IHT_16[tx_type]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 16; ++i) { @@ -307,7 +304,7 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } // idct -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_idct4x4_16_add(input, dest, stride, bd); @@ -315,7 +312,7 @@ void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_idct4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_iwht4x4_16_add(input, dest, stride, bd); @@ -323,7 +320,7 @@ void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_iwht4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. @@ -340,7 +337,7 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, } } -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // The calculation can be simplified if there are not many non-zero dct // coefficients. Use eobs to separate different cases. 
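[Editor's aside, not part of the patch: every hunk in this commit switches the high-bitdepth destination argument from a shifted uint8_t * "byte pointer" (the legacy CONVERT_TO_BYTEPTR / CONVERT_TO_SHORTPTR convention) to a plain uint16_t *, with CAST_TO_BYTEPTR / CAST_TO_SHORTPTR acting as simple reinterpreting casts at the remaining uint8_t * boundaries. The standalone sketch below contrasts the two conventions; the CONVERT_TO_* bodies are quoted from memory of vpx_ports/mem.h and the CAST_TO_* bodies are simplified stand-ins, so treat both as assumptions rather than the project's exact definitions.

  #include <stdint.h>
  #include <stdio.h>

  /* Legacy convention (assumed, from vpx_ports/mem.h): a uint16_t buffer
   * travels through uint8_t * parameters with its address halved, so byte
   * arithmetic on the fake pointer lands on whole pixels once the address
   * is doubled again on the way back. */
  #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
  #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))

  /* Convention after this patch: highbd functions take uint16_t * directly,
   * so CAST_TO_* is nothing more than a reinterpreting cast (illustrative
   * definitions). */
  #define CAST_TO_BYTEPTR(x) ((uint8_t *)(x))
  #define CAST_TO_SHORTPTR(x) ((uint16_t *)(x))

  /* Stand-in for a new-style highbd function such as vp9_highbd_idct4x4_add:
   * it receives the real uint16_t * destination. */
  static void highbd_add_dc(uint16_t *dest, int count, int dc) {
    int i;
    for (i = 0; i < count; ++i) dest[i] = (uint16_t)(dest[i] + dc);
  }

  int main(void) {
    uint16_t pixels[4] = { 1, 2, 3, 4 };
    /* Callers that still store uint8_t * (e.g. the test wrappers above)
     * round-trip through a cast instead of shifted addresses. */
    uint8_t *opaque = CAST_TO_BYTEPTR(pixels);
    highbd_add_dc(CAST_TO_SHORTPTR(opaque), 4, 100);
    printf("%d %d %d %d\n", pixels[0], pixels[1], pixels[2], pixels[3]);
    return 0;
  }

Compiled with e.g. "cc -std=c99 demo.c", this prints 101 102 103 104: the pixel buffer is updated through the round-tripped pointer with no address shifting involved.]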
@@ -356,7 +353,7 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, } } -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // Non-zero coeff only in upper-left 8x8 if (eob == 1) { @@ -372,7 +369,7 @@ void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, // iht void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) vp9_highbd_idct4x4_add(input, dest, stride, eob, bd); else @@ -380,7 +377,7 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct8x8_add(input, dest, stride, eob, bd); } else { @@ -389,7 +386,7 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct16x16_add(input, dest, stride, eob, bd); } else { diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index ea958a38c..3e83b8402 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -57,22 +57,22 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); #endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index da449e254..20013cb9e 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -101,11 +101,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. 
- add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; } # @@ -120,7 +120,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { # if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { add_proto qw/int vp9_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude"; - specialize qw/vp9_denoiser_filter sse2/; + specialize qw/vp9_denoiser_filter neon sse2/; } if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -225,7 +225,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # frame based scale # -add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, int phase_scaler"; +add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, uint8_t filter_type, int phase_scaler"; specialize qw/vp9_scale_and_extend_frame ssse3/; } diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index dcfc454aa..bb2dcf52b 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -16,7 +16,6 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[2]; - const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); in[0] = load_input_data(input); @@ -49,31 +48,7 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, in[0]); - d2 = _mm_add_epi16(d2, in[1]); - d0 = _mm_packus_epi16(d0, d2); - // store result[0] - *(int *)dest = _mm_cvtsi128_si32(d0); - // store result[1] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store result[2] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - // store result[3] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - } + recon_and_store4x4_sse2(in, dest, stride); } void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index a9e5eebba..0760f8c23 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -189,21 +189,22 @@ static void 
inverse_transform_block_inter(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -256,21 +257,22 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_8X8: - vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_16X16: - vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c new file mode 100644 index 000000000..4152e7bb5 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" +#include "vpx_mem/vpx_mem.h" + +// Compute the sum of all pixel differences of this MB.
+static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); + const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); + const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210), + vget_low_s64(fedcba98_76543210)); + const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); + return sum_diff; +} + +// Denoise a 16x1 vector. +static INLINE int8x16_t denoiser_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold, + const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment, + const uint8x16_t v_delta_level_1_and_2, + const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) { + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = + vandq_u8(v_level2_mask, v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = + vandq_u8(v_level3_mask, v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = + vaddq_u8(v_level1_adjustment, v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = + vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = + vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. 
+ */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +static INLINE int8x16_t denoiser_adjust_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t k_delta, int8x16_t v_sum_diff_total) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +// Denoise 8x8 and 8x16 blocks. +static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude, + int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 
4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_height = (4 << b_height_log2_lookup[bs]) >> 1; + + int8x16_t v_sum_diff_total = vdupq_n_s8(0); + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + v_sum_diff_total = denoiser_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], + v_level1_threshold, v_level2_threshold, v_level3_threshold, + v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + v_sum_diff_total = denoiser_adjust_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = + vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = + vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + return FILTER_BLOCK; +} + +// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks. 
+static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_width = (4 << b_width_log2_lookup[bs]); + const int b_height = (4 << b_height_log2_lookup[bs]); + const int b_width_shift4 = b_width >> 4; + + int8x16_t v_sum_diff_total[4][4]; + int r, c, sum_diff = 0; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r] = vdupq_n_s8(0); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon( + sig, mc_running_avg_y, running_avg_y, v_level1_threshold, + v_level2_threshold, v_level3_threshold, v_level1_adjustment, + v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vdupq_n_u8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = + denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y, + k_delta, v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. 
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } + return COPY_BLOCK; +} diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index bdd666286..ab488f48f 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -184,7 +184,7 @@ struct macroblock { void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH - void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, + void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); #endif }; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 16eea8fa2..7e30499c5 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -637,24 +637,25 @@ static void encode_block(int plane, int block, int row, int col, if (x->skip_encode || p->eobs[block] == 0) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_4X4: // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
- x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + x->highbd_itxm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; default: assert(0 && "Invalid transform size"); @@ -699,7 +700,8 @@ static void encode_block_pass1(int plane, int block, int row, int col, if (p->eobs[block] > 0) { #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd); + x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, + p->eobs[block], xd->bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -799,6 +801,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: if (!x->skip_recode) { @@ -814,7 +817,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { - vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; case TX_16X16: @@ -834,7 +837,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { - vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob, + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; @@ -855,7 +858,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { - vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob, + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; @@ -880,9 +883,10 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around // eob<=1 which is significant (not just an optimization) for the // lossless case. 
- x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + x->highbd_itxm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } else { - vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd); + vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type, + xd->bd); } } break; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 62d887471..32878dcc8 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2391,6 +2391,7 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, #if CONFIG_VP9_HIGHBITDEPTH static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int bd, + INTERP_FILTER filter_type, int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; @@ -2401,7 +2402,7 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; int x, y, i; for (i = 0; i < MAX_MB_PLANE; ++i) { @@ -2714,7 +2715,8 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, 0); + scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, + EIGHTTAP, 0); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -2737,7 +2739,7 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, 0); + vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -3124,10 +3126,13 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. + const INTERP_FILTER filter_scaler = + (is_one_pass_cbr_svc(cpi)) + ? cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] + : EIGHTTAP; const int phase_scaler = - (is_one_pass_cbr_svc(cpi) && - cpi->svc.filtertype_downsample_source[cpi->svc.spatial_layer_id]) - ? 8 + (is_one_pass_cbr_svc(cpi)) + ? cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] : 0; // Flag to check if its valid to compute the source sad (used for @@ -3148,10 +3153,11 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2 // result will be saved in scaled_temp and might be used later. - int phase_scaler2 = (cpi->svc.filtertype_downsample_source[1]) ? 
8 : 0; + const INTERP_FILTER filter_scaler2 = cpi->svc.downsample_filter_type[1]; + const int phase_scaler2 = cpi->svc.downsample_filter_phase[1]; cpi->Source = vp9_svc_twostage_scale( cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp, - phase_scaler, phase_scaler2); + filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); cpi->svc.scaled_one_half = 1; } else if (is_one_pass_cbr_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && @@ -3162,9 +3168,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->Source = &cpi->svc.scaled_temp; cpi->svc.scaled_one_half = 0; } else { - cpi->Source = - vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source, - (cpi->oxcf.pass == 0), phase_scaler); + cpi->Source = vp9_scale_if_required( + cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0), + filter_scaler, phase_scaler); } // Unfiltered raw source used in metrics calculation if the source // has been filtered. @@ -3173,7 +3179,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) { cpi->raw_source_frame = vp9_scale_if_required( cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, - (cpi->oxcf.pass == 0), phase_scaler); + (cpi->oxcf.pass == 0), EIGHTTAP, phase_scaler); } else { cpi->raw_source_frame = cpi->Source; } @@ -3205,9 +3211,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION || (cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) || cpi->compute_source_sad_onepass)) - cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, - &cpi->scaled_last_source, - (cpi->oxcf.pass == 0), 0); + cpi->Last_Source = vp9_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + (cpi->oxcf.pass == 0), EIGHTTAP, 0); if (cpi->Last_Source == NULL || cpi->Last_Source->y_width != cpi->Source->y_width || @@ -3392,7 +3398,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source, - (cpi->oxcf.pass == 0), 0); + (cpi->oxcf.pass == 0), EIGHTTAP, 0); // Unfiltered raw source used in metrics calculation if the source // has been filtered. 
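The hunks above thread an explicit INTERP_FILTER through every scaling call instead of a bare phase flag. A minimal sketch of the resulting call pattern, using only names introduced by this patch (phase 0 selects plain sub-sampling, phase 8 the centered averaging filter, per the SVC struct comment further down):

    /* Sketch only: the caller-side filter/phase selection added here. */
    const int sl = cpi->svc.spatial_layer_id;
    const INTERP_FILTER filter =
        is_one_pass_cbr_svc(cpi) ? cpi->svc.downsample_filter_type[sl]
                                 : EIGHTTAP;
    const int phase =
        is_one_pass_cbr_svc(cpi) ? cpi->svc.downsample_filter_phase[sl] : 0;
    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
                                        &cpi->scaled_source,
                                        (cpi->oxcf.pass == 0), filter, phase);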
@@ -3401,7 +3407,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) { cpi->raw_source_frame = vp9_scale_if_required( cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, - (cpi->oxcf.pass == 0), 0); + (cpi->oxcf.pass == 0), EIGHTTAP, 0); } else { cpi->raw_source_frame = cpi->Source; } @@ -3411,9 +3417,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, } if (cpi->unscaled_last_source != NULL) - cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, - &cpi->scaled_last_source, - (cpi->oxcf.pass == 0), 0); + cpi->Last_Source = vp9_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + (cpi->oxcf.pass == 0), EIGHTTAP, 0); if (frame_is_intra_only(cm) == 0) { if (loop_count > 0) { @@ -3693,22 +3699,26 @@ static void set_ext_overrides(VP9_COMP *cpi) { YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp, int phase_scaler, int phase_scaler2) { + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) { if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->bit_depth == VPX_BITS_8) { - vp9_scale_and_extend_frame(unscaled, scaled_temp, phase_scaler2); - vp9_scale_and_extend_frame(scaled_temp, scaled, phase_scaler); + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, + phase_scaler); } else { scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth, - phase_scaler2); + filter_type2, phase_scaler2); scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth, - phase_scaler); + filter_type, phase_scaler); } #else - vp9_scale_and_extend_frame(unscaled, scaled_temp, phase_scaler2); - vp9_scale_and_extend_frame(scaled_temp, scaled, phase_scaler); + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler); #endif // CONFIG_VP9_HIGHBITDEPTH return scaled; } else { @@ -3716,27 +3726,25 @@ YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( } } -YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - int use_normative_scaler, - int phase_scaler) { +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) { if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) if (cm->bit_depth == VPX_BITS_8) - vp9_scale_and_extend_frame(unscaled, scaled, phase_scaler); + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); else scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, - phase_scaler); + filter_type, phase_scaler); else scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); #else if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) - vp9_scale_and_extend_frame(unscaled, scaled, phase_scaler); + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, 
phase_scaler); else scale_and_extend_frame_nonnormative(unscaled, scaled); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 1c7f17e19..c47926168 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -836,17 +836,14 @@ void vp9_update_reference_frames(VP9_COMP *cpi); void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp, - int phase_scaler, int phase_scaler2); - -YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - int use_normative_scaler, - int phase_scaler); +YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2); + +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler); void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 55f425b6c..e398ff5e8 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1400,7 +1400,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE); cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, - &cpi->scaled_source, 0, 0); + &cpi->scaled_source, 0, EIGHTTAP, 0); } vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index 03f8fc480..e58628388 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -16,7 +16,8 @@ #include "vpx_scale/yv12config.h" void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int phase_scaler) { + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; @@ -26,7 +27,7 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; int x, y, i; for (i = 0; i < MAX_MB_PLANE; ++i) { diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c index fc2e32448..ca0873e4c 100644 --- a/vp9/encoder/vp9_noise_estimate.c +++ b/vp9/encoder/vp9_noise_estimate.c @@ -26,25 +26,27 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->level = kLowLow; ne->value = 0; ne->count = 0; - ne->thresh = 100; + ne->thresh = 90; ne->last_w = 0; ne->last_h = 0; if (width * height >= 1920 * 1080) { ne->thresh = 200; } else if (width * height >= 1280 * 720) { - ne->thresh = 140; + ne->thresh = 150; + } else if (width * height >= 640 * 360) { + ne->thresh = 100; } - ne->num_frames_estimate = 20; + 
ne->num_frames_estimate = 15; } static int enable_noise_estimation(VP9_COMP *const cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (cpi->common.use_highbitdepth) return 0; #endif -// Enable noise estimation if denoising is on, but not for low resolutions. +// Enable noise estimation if denoising is on. #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && - cpi->common.width >= 640 && cpi->common.height >= 360) + cpi->common.width >= 320 && cpi->common.height >= 180) return 1; #endif // Only allow noise estimate under certain encoding mode. @@ -97,6 +99,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { void vp9_update_noise_estimate(VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; NOISE_ESTIMATE *const ne = &cpi->noise_estimate; + const int low_res = (cm->width <= 352 && cm->height <= 288); // Estimate of noise level every frame_period frames. int frame_period = 8; int thresh_consec_zeromv = 6; @@ -111,6 +114,13 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) last_source = &cpi->denoiser.last_source; #endif + // Tune these thresholds for different resolutions. + if (cm->width > 640 && cm->width < 1920) { + thresh_consec_zeromv = 5; + thresh_sum_diff = 200; + thresh_sum_spatial = (120 * 120) << 8; + thresh_spatial_var = (48 * 48) << 8; + } ne->enabled = enable_noise_estimation(cpi); if (cpi->svc.number_spatial_layers > 1) frame_counter = cpi->svc.current_superframe; @@ -127,9 +137,12 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->last_h = cm->height; } return; - } else if (cpi->rc.avg_frame_low_motion < 50) { + } else if (cm->current_video_frame > 60 && + cpi->rc.avg_frame_low_motion < (low_res ? 70 : 50)) { // Force noise estimation to 0 and denoiser off if content has high motion. ne->level = kLowLow; + ne->count = 0; + ne->num_frames_estimate = 10; #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && cpi->svc.current_superframe > 1) { @@ -210,7 +223,8 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // Avoid blocks with high brightness and high spatial variance. if ((sse2 - spatial_variance) < thresh_sum_spatial && spatial_variance < thresh_spatial_var) { - avg_est += variance / ((spatial_variance >> 9) + 1); + avg_est += low_res ? 
variance >> 4 + : variance / ((spatial_variance >> 9) + 1); num_samples++; } } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index d4aa46e83..bf0fec3d8 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -601,26 +601,26 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, 32, NULL, 0, NULL, 0, bs, bs, xd->bd); - recon = CONVERT_TO_BYTEPTR(recon16); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } } + recon = CONVERT_TO_BYTEPTR(recon16); } else { #endif // CONFIG_VP9_HIGHBITDEPTH vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs); @@ -1004,6 +1004,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int block = (row + idy) * 2 + (col + idx); const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); @@ -1025,7 +1026,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 
1 : 0); if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next_highbd; - vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, + vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16, dst_stride, p->eobs[block], xd->bd); } else { int64_t unused; @@ -1048,7 +1049,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next_highbd; vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), - dst, dst_stride, p->eobs[block], xd->bd); + dst16, dst_stride, p->eobs[block], xd->bd); } } } diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index ed628758b..5867a6c38 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -38,11 +38,12 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->current_superframe = 0; for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { - cpi->svc.ext_frame_flags[sl] = 0; - cpi->svc.ext_lst_fb_idx[sl] = 0; - cpi->svc.ext_gld_fb_idx[sl] = 1; - cpi->svc.ext_alt_fb_idx[sl] = 2; - cpi->svc.filtertype_downsample_source[sl] = 0; + svc->ext_frame_flags[sl] = 0; + svc->ext_lst_fb_idx[sl] = 0; + svc->ext_gld_fb_idx[sl] = 1; + svc->ext_alt_fb_idx[sl] = 2; + svc->downsample_filter_type[sl] = EIGHTTAP; + svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter. } if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { @@ -651,11 +652,17 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); + // For low resolutions: set phase of the filter = 8 (for symmetric averaging + // filter), use bilinear for now. + if (width <= 320 && height <= 240) { + cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR; + cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8; + } + // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use // of base motion vectors if spatial scale factors for any layers are not 2, // keep the case of 3 spatial layers with scale factor of 4x4 for base layer. // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2. - // Same condition applies to use of non-zero phase_scaler. if (cpi->svc.number_spatial_layers > 1) { int sl; for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) { @@ -664,10 +671,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) && !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 && cpi->svc.number_spatial_layers == 3)) { - int sl2; cpi->svc.use_base_mv = 0; - for (sl2 = 0; sl2 < cpi->svc.number_spatial_layers - 1; ++sl2) - cpi->svc.filtertype_downsample_source[sl2] = 0; break; } } diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 7442e6ff7..d8e6772b2 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -89,11 +89,12 @@ typedef struct { int current_superframe; int use_base_mv; // Used to control the downscaling filter for source scaling, for 1 pass CBR. - // 0 will do sub-sampling (no weighted average), 1 will center the target - // pixel and use the averaging filter, for the default eightap_regular: - // {-1, 6, -19, 78, 78, -19, 6, -1 }. - // TODO(marpan): Add option for bilinear. 
- int filtertype_downsample_source[VPX_SS_MAX_LAYERS]; + // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), + // = 8 will center the target pixel and get a symmetric averaging filter. + // downsample_filter_type: 4 filters may be used: eighttap_regular, + // eighttap_smooth, eighttap_sharp, and bilinear. + INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS]; + int downsample_filter_phase[VPX_SS_MAX_LAYERS]; } SVC; struct VP9_COMP; diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 31bcb246a..630794156 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -755,7 +755,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { "Failed to reallocate alt_ref_buffer"); } frames[frame] = vp9_scale_if_required( - cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0, 0); + cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0, + EIGHTTAP, 0); ++frame_used; } } diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c index 31e17f21f..b53714a02 100644 --- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -17,7 +17,7 @@ extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, - int phase_scaler); + uint8_t filter_type, int phase_scaler); static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, @@ -170,7 +170,7 @@ static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride, void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, - int phase_scaler) { + uint8_t filter_type, int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; @@ -198,9 +198,9 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, dst->uv_stride, dst_uv_w, dst_uv_h); vpx_extend_frame_borders(dst); } else { - vp9_scale_and_extend_frame_c(src, dst, phase_scaler); + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); } } else { - vp9_scale_and_extend_frame_c(src, dst, phase_scaler); + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); } } diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 4b8ec5048..47846c941 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -121,6 +121,7 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_denoiser_neon.c endif VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c index 96f6de1be..b26920504 100644 --- a/vpx_dsp/arm/fwd_txfm_neon.c +++ b/vpx_dsp/arm/fwd_txfm_neon.c @@ -125,6 +125,8 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 } // transpose 8x8 + // Can't use transpose_s16_8x8() because the values are arranged in two 4x8 + // columns. 
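+  // (Each 8-lane vector holds four values of row r followed by four values
+  // of row r + 4, as the layout sketch below shows, so the generic 8x8
+  // transpose helper does not apply.)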
{ // 00 01 02 03 40 41 42 43 // 10 11 12 13 50 51 52 53 diff --git a/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/vpx_dsp/arm/highbd_idct16x16_add_neon.c index 1259bb380..98e42cd25 100644 --- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c +++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -1268,10 +1268,8 @@ void vpx_highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, } } -void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - if (bd == 8) { int16_t row_idct_output[16 * 16]; @@ -1313,10 +1311,8 @@ void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - if (bd == 8) { int16_t row_idct_output[16 * 16]; @@ -1349,10 +1345,8 @@ void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - if (bd == 8) { int16_t row_idct_output[4 * 16]; @@ -1414,7 +1408,7 @@ static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const tran_low_t out0 = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); @@ -1422,7 +1416,6 @@ void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int i; if (a1 >= 0) { diff --git a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c index 858342830..96a55c472 100644 --- a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -386,15 +386,14 @@ static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out, } static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input, - uint8_t *const dest, - const int stride, const int bd) { + uint16_t *dst, const int stride, + const int bd) { int i, idct32_pass_loop; int32_t trans_buf[32 * 8]; int32_t pass1[32 * 32]; int32_t pass2[32 * 32]; int32_t *out; int32x4x2_t q[16]; - uint16_t *dst = CONVERT_TO_SHORTPTR(dest); for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; idct32_pass_loop++, input = pass1, out = pass2) { @@ -637,10 +636,10 @@ static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input, } } -void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, +void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { if (bd == 8) { - vpx_idct32_32_neon(input, dest, stride, 1); + vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1); } else { vpx_highbd_idct32_32_neon(input, dest, stride, bd); } diff --git a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c index 52f3d43e5..3970a5a86 100644 --- 
a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c @@ -726,10 +726,9 @@ static void vpx_highbd_idct32_16_neon(const int32_t *const input, highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); } -void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); if (bd == 8) { int16_t temp[32 * 16]; diff --git a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c index 195dcc92d..5d9063b15 100644 --- a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c @@ -594,10 +594,9 @@ static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output, highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); } -void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); if (bd == 8) { int16_t temp[32 * 8]; diff --git a/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_add_neon.c index d74331f80..63eb49678 100644 --- a/vpx_dsp/arm/highbd_idct32x32_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_add_neon.c @@ -59,7 +59,7 @@ static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const tran_low_t out0 = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); @@ -67,7 +67,6 @@ void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int i; if (a1 >= 0) { diff --git a/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/vpx_dsp/arm/highbd_idct4x4_add_neon.c index 128f72b9c..20b09f683 100644 --- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -51,7 +51,7 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); const tran_low_t out0 = @@ -60,7 +60,6 @@ void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); @@ -133,14 +132,13 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, *a3 = vsubq_s32(b0, b3); } -void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); int32x4_t c0 = vld1q_s32(input); int32x4_t c1 = vld1q_s32(input + 4); int32x4_t c2 = vld1q_s32(input + 8); int32x4_t c3 = 
vld1q_s32(input + 12); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int16x8_t a0, a1; if (bd == 8) { diff --git a/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/vpx_dsp/arm/highbd_idct8x8_add_neon.c index f53f4c7fc..6687e7649 100644 --- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -36,7 +36,7 @@ static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const tran_low_t out0 = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); @@ -44,7 +44,6 @@ void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); if (a1 >= 0) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); @@ -292,9 +291,8 @@ static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, vst1q_u16(dest, d7_u16); } -void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int32x4_t a0 = vld1q_s32(input); int32x4_t a1 = vld1q_s32(input + 8); int32x4_t a2 = vld1q_s32(input + 16); @@ -553,9 +551,8 @@ static INLINE void idct8x8_64_half1d_bd12( *io7 = vsubq_s32(step1[0], step2[7]); } -void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int32x4_t a0 = vld1q_s32(input); int32x4_t a1 = vld1q_s32(input + 4); int32x4_t a2 = vld1q_s32(input + 8); diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c index 34b5baf72..91418c9e6 100644 --- a/vpx_dsp/arm/idct32x32_add_neon.c +++ b/vpx_dsp/arm/idct32x32_add_neon.c @@ -517,7 +517,7 @@ void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1 int16_t *out; int16x8_t q[16]; - uint16_t *dst = CONVERT_TO_SHORTPTR(dest); + uint16_t *dst = CAST_TO_SHORTPTR(dest); for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; idct32_pass_loop++, out = pass2) { diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index b6d7f86a4..c0828e8f6 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -31,77 +31,129 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { return vget_lane_s32(c, 0); } -// w * h must be less than 2048 or local variable v_sum may overflow. -static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, - int *sum) { +// w * h must be less than 2048 or sum_s16 may overflow. +// Process a block of any size where the width is divisible by 16. 
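+// (2048 pixels across the 8 int16 lanes of sum_s16 is 256 signed residuals
+// per lane; callers therefore split larger blocks into pieces of at most
+// 2048 pixels, as vpx_variance32x64_neon() and vpx_variance64x64_neon()
+// below do.)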
+static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { int i, j; - int16x8_t v_sum = vdupq_n_s16(0); - int32x4_t v_sse_lo = vdupq_n_s32(0); - int32x4_t v_sse_hi = vdupq_n_s32(0); + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_lo_s32 = vdupq_n_s32(0); + int32x4_t sse_hi_s32 = vdupq_n_s32(0); for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const uint8x8_t v_a = vld1_u8(&a[j]); - const uint8x8_t v_b = vld1_u8(&b[j]); - const uint16x8_t v_diff = vsubl_u8(v_a, v_b); - const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); - v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = - vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); - v_sse_hi = - vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); + for (j = 0; j < w; j += 16) { + const uint8x16_t a_u8 = vld1q_u8(a + j); + const uint8x16_t b_u8 = vld1q_u8(b + j); + + const uint16x8_t diff_lo_u16 = + vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); + const uint16x8_t diff_hi_u16 = + vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); + + const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); + const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); + + sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); + sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), + vget_low_s16(diff_lo_s16)); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), + vget_high_s16(diff_lo_s16)); + + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), + vget_low_s16(diff_hi_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), + vget_high_s16(diff_hi_s16)); } a += a_stride; b += b_stride; } - *sum = horizontal_add_s16x8(v_sum); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); + *sum = horizontal_add_s16x8(sum_s16); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); +} + +// w * h must be less than 2048 or sum_s16 may overflow. +// Process a block of width 8 two rows at a time. 
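+// (Pairing the rows yields two full int16x8 difference vectors per
+// iteration, the same amount of work per loop as one 16-wide row in
+// variance_neon_w16 above.)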
+static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int h, uint32_t *sse, int *sum) { + int i = 0; + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_lo_s32 = vdupq_n_s32(0); + int32x4_t sse_hi_s32 = vdupq_n_s32(0); + + do { + const uint8x8_t a_0_u8 = vld1_u8(a); + const uint8x8_t a_1_u8 = vld1_u8(a + a_stride); + const uint8x8_t b_0_u8 = vld1_u8(b); + const uint8x8_t b_1_u8 = vld1_u8(b + b_stride); + const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); + const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); + const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); + const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16); + sum_s16 = vaddq_s16(sum_s16, diff_0_s16); + sum_s16 = vaddq_s16(sum_s16, diff_1_s16); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16), + vget_low_s16(diff_0_s16)); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16), + vget_low_s16(diff_1_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16), + vget_high_s16(diff_0_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), + vget_high_s16(diff_1_s16)); + a += a_stride + a_stride; + b += b_stride + b_stride; + i += 2; + } while (i < h); + + *sum = horizontal_add_s16x8(sum_s16); + *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); } void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); + variance_neon_w8x2(a, a_stride, b, b_stride, 8, sse, sum); } void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); -} - -unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - ((sum * sum) >> 6); + variance_neon_w16(a, a_stride, b, b_stride, 16, 16, sse, sum); } -unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); -} +#define varianceNxM(n, m, shift) \ + unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + if (n == 8) \ + variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum); \ + else \ + variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum); \ + if (n * m < 16 * 16) \ + return *sse - ((sum * sum) >> shift); \ + else \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } -unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); -} +varianceNxM(8, 4, 5); +varianceNxM(8, 8, 6); +varianceNxM(8, 16, 7); +varianceNxM(16, 8, 7); +varianceNxM(16, 16, 8); +varianceNxM(16, 32, 9); +varianceNxM(32, 16, 9); +varianceNxM(32, 32, 10); unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + 
(32 * a_stride), a_stride, b + (32 * b_stride), b_stride, - 32, 32, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w16(a + (32 * a_stride), a_stride, b + (32 * b_stride), + b_stride, 32, 32, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); @@ -112,9 +164,9 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), + b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); @@ -126,162 +178,24 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), + b_stride, 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), + b_stride, 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), + b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } -unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, 
vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c index 210a9bed9..29323d1b8 100644 --- a/vpx_dsp/inv_txfm.c +++ b/vpx_dsp/inv_txfm.c @@ -1182,16 +1182,10 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, // Rows for (i = 0; i < 32; ++i) { - 
int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) + int16_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; + + if (zero_coeff) idct32_c(input, outptr); else memset(outptr, 0, sizeof(tran_low_t) * 32); @@ -1290,7 +1284,7 @@ static INLINE int detect_invalid_highbd_input(const tran_low_t *input, return 0; } -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ @@ -1299,7 +1293,6 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, tran_high_t a1, b1, c1, d1, e1; const tran_low_t *ip = input; tran_low_t *op = output; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); for (i = 0; i < 4; i++) { a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1348,14 +1341,13 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; const tran_low_t *ip = in; tran_low_t *op = tmp; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); (void)bd; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1452,13 +1444,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); } -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 4; ++i) { @@ -1478,13 +1469,12 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1; tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); @@ -1636,13 +1626,12 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); } -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows for (i = 0; i < 8; ++i) { @@ -1662,13 +1651,12 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[8 * 8] = { 0 }; 
tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows // Only first 4 row has non-zero coefs @@ -1689,13 +1677,12 @@ void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); @@ -2056,13 +2043,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); } -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows for (i = 0; i < 16; ++i) { @@ -2082,13 +2068,12 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows. Since all non-zero dct coefficients are in // upper-left 8x8 area, we only need to calculate first 8 rows here. @@ -2111,13 +2096,12 @@ void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. 
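Throughout these C kernels the change is the same: the uint8_t *dest8 parameter and the in-kernel CONVERT_TO_SHORTPTR() unpacking give way to a plain uint16_t *dest, so the conversion happens once at the call site. A minimal before/after sketch with illustrative arguments, reusing names from the encode_block() hunk earlier in this patch:

    /* Before: the kernel unpacked the disguised pointer itself. */
    vpx_highbd_idct16x16_256_add_c(dqcoeff, dst, stride, bd);

    /* After: the caller converts once and passes a real uint16_t pointer. */
    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
    vpx_highbd_idct16x16_256_add_c(dqcoeff, dst16, stride, bd);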
@@ -2138,13 +2122,12 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -2531,26 +2514,19 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); } -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 32; ++i) { - tran_low_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) + tran_low_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; + + if (zero_coeff) highbd_idct32_c(input, outptr, bd); else memset(outptr, 0, sizeof(tran_low_t) * 32); @@ -2569,13 +2545,12 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8); // Rows // Only upper-left 16x16 has non-zero coeff @@ -2598,13 +2573,12 @@ void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows // Only upper-left 8x8 has non-zero coeff @@ -2625,11 +2599,10 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; int a1; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); tran_low_t out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index d1c991782..6daa58390 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -231,6 +231,11 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += 
x86/highbd_idct4x4_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c endif # !CONFIG_VP9_HIGHBITDEPTH ifeq ($(HAVE_NEON_ASM),yes) @@ -351,6 +356,9 @@ DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h +# X86 utilities +DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h + DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) DSP_SRCS-yes += vpx_dsp_rtcd.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d025a2f5b..3126ae6c8 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -629,39 +629,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. specialize qw/vpx_iwht4x4_16_add sse2/; - add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct4x4_1_add neon/; - add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct8x8_1_add neon/; - add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct16x16_1_add neon/; - add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void 
vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct32x32_1_add neon sse2/; - add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vpx_highbd_idct4x4_16_add neon sse2/; @@ -1177,10 +1177,10 @@ add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int sourc specialize qw/vpx_variance32x32 sse2 avx2 neon msa/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 msa/; + specialize qw/vpx_variance32x16 sse2 avx2 neon msa/; add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 msa/; + specialize qw/vpx_variance16x32 sse2 neon msa/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance16x16 sse2 avx2 neon msa/; @@ -1195,12 +1195,14 @@ add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_ specialize qw/vpx_variance8x8 sse2 neon msa/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 msa/; + specialize qw/vpx_variance8x4 sse2 neon msa/; add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +# TODO(johannkoenig): neon specialize qw/vpx_variance4x8 sse2 msa/; add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +# TODO(johannkoenig): neon specialize qw/vpx_variance4x4 sse2 msa/; # diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c new file mode 100644 index 000000000..f16e4d071 --- /dev/null +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[32]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_set1_epi16(32); + const __m128i max = _mm_set1_epi16(3155); + const __m128i min = _mm_set1_epi16(-3155); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 16; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); + inptr[i + 16] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 32; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct16_sse2(inptr, inptr + 16); + + // Find the min & max for the column transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 32; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + array_transpose_16x16(inptr, inptr + 16); + for (i = 0; i < 16; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); + sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); + temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 16; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + } + + if (optimised_cols) { + idct16_sse2(inptr, inptr + 16); + + // Final round & shift and Reconstruction and Store + { + __m128i d[2]; + for (i = 0; i < 16; i++) { + inptr[i] = _mm_add_epi16(inptr[i], rounding); + inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); + inptr[i] = _mm_srai_epi16(inptr[i], 6); + inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], 
inptr[i]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[16], temp_out[16]; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } + } +} + +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[32]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_set1_epi16(32); + const __m128i max = _mm_set1_epi16(3155); + const __m128i min = _mm_set1_epi16(-3155); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 16; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); + inptr[i + 16] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // Since all non-zero dct coefficients are in upper-left 4x4 area, + // we only need to consider first 4 rows here. + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform (N.B. This transposes inptr) + idct16_sse2(inptr, inptr + 16); + + // Find the min & max for the column transform + // N.B. 
Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 16; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use fact only first 4 rows contain non-zero coeffs + array_transpose_8x8(inptr, inptr); + array_transpose_8x8(inptr + 8, inptr + 16); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); + sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); + temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + } + + if (optimised_cols) { + idct16_sse2(inptr, inptr + 16); + + // Final round & shift and Reconstruction and Store + { + __m128i d[2]; + for (i = 0; i < 16; i++) { + inptr[i] = _mm_add_epi16(inptr[i], rounding); + inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); + inptr[i] = _mm_srai_epi16(inptr[i], 6); + inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[16], temp_out[16]; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } + } +} diff --git a/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c new file mode 100644 index 000000000..bc9debf31 --- /dev/null +++ b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i dc_value, d; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + int a, i, j; + tran_low_t out; + + out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + a = ROUND_POWER_OF_TWO(out, 6); + + d = _mm_set1_epi32(a); + dc_value = _mm_packs_epi32(d, d); + for (i = 0; i < 32; ++i) { + for (j = 0; j < 4; ++j) { + d = _mm_loadu_si128((const __m128i *)(&dest[j * 8])); + d = _mm_adds_epi16(d, dc_value); + d = _mm_max_epi16(d, zero); + d = _mm_min_epi16(d, max); + _mm_storeu_si128((__m128i *)(&dest[j * 8]), d); + } + dest += stride; + } +} diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c new file mode 100644 index 000000000..f3a69795e --- /dev/null +++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + __m128i inptr[4]; + __m128i sign_bits[2]; + __m128i temp_mm, min_input, max_input; + int test; + int optimised_cols = 0; + const __m128i zero = _mm_set1_epi16(0); + const __m128i eight = _mm_set1_epi16(8); + const __m128i max = _mm_set1_epi16(12043); + const __m128i min = _mm_set1_epi16(-12043); + // Load input into __m128i + inptr[0] = _mm_loadu_si128((const __m128i *)input); + inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); + inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); + inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); + + // Pack to 16 bits + inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); + inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); + + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (!test) { + // Do the row transform + idct4_sse2(inptr); + + // Check the min & max values + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (test) { + transpose_4x4(inptr); + sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); + sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); + inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); + inptr[2] = 
_mm_unpacklo_epi16(inptr[1], sign_bits[1]); + inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); + inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); + _mm_storeu_si128((__m128i *)outptr, inptr[0]); + _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); + _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); + _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vpx_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + } + + if (optimised_cols) { + idct4_sse2(inptr); + + // Final round and shift + inptr[0] = _mm_add_epi16(inptr[0], eight); + inptr[1] = _mm_add_epi16(inptr[1], eight); + + inptr[0] = _mm_srai_epi16(inptr[0], 4); + inptr[1] = _mm_srai_epi16(inptr[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); + __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi64( + d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); + d2 = _mm_unpacklo_epi64( + d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); + d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); + d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); + // store input0 + _mm_storel_epi64((__m128i *)dest, d0); + // store input1 + d0 = _mm_srli_si128(d0, 8); + _mm_storel_epi64((__m128i *)(dest + stride), d0); + // store input2 + _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); + // store input3 + d2 = _mm_srli_si128(d2, 8); + _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[4], temp_out[4]; + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + vpx_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } + } +} diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c new file mode 100644 index 000000000..6a2e18064 --- /dev/null +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct8_sse2(inptr); + + // Find the min & max for the column transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + array_transpose_8x8(inptr, inptr); + for (i = 0; i < 8; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 8; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} + +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, 
temp1, temp2, sign_bits; + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // only first 4 row has non-zero coefs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct8_sse2(inptr); + + // Find the min & max for the column transform + // N.B. Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use fact only first 4 rows contain non-zero coeffs + array_transpose_4X8(inptr, inptr); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h new file mode 100644 index 000000000..774cce1d4 --- /dev/null +++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ + +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { + __m128i ubounded, retval; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + ubounded = _mm_cmpgt_epi16(value, max); + retval = _mm_andnot_si128(ubounded, value); + ubounded = _mm_and_si128(ubounded, max); + retval = _mm_or_si128(retval, ubounded); + retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); + return retval; +} + +#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 8c33caedb..4b201b987 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -10,153 +10,36 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ - } - void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; + __m128i in[2]; // Rows - input0 = load_input_data(input); - input2 = load_input_data(input + 8); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. 
- input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8); + idct4_sse2(in); // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); + idct4_sse2(in); // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } + recon_and_store4x4_sse2(in, dest, stride); } void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a; + __m128i dc_value, d[2]; a = (int)dct_const_round_shift(input[0] * cospi_16_64); a = (int)dct_const_round_shift(a * cospi_16_64); @@ -164,18 +47,26 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, dc_value = _mm_set1_epi16(a); - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -static INLINE void transpose_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], 
res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], dc_value); + d[1] = _mm_add_epi16(d[1], dc_value); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); } void idct4_sse2(__m128i *in) { @@ -3349,595 +3240,3 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, RECON_AND_STORE(dest + 24 + j * stride, dc_value); } } - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = 
_mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - 
max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. 
Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], 
inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. 
- max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - __m128i dc_value, d; - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - int a, i, j; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out; - - out = 
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); - a = ROUND_POWER_OF_TWO(out, 6); - - d = _mm_set1_epi32(a); - dc_value = _mm_packs_epi32(d, d); - for (i = 0; i < 32; ++i) { - for (j = 0; j < 4; ++j) { - d = _mm_loadu_si128((const __m128i *)(&dest[j * 8])); - d = _mm_adds_epi16(d, dc_value); - d = _mm_max_epi16(d, zero); - d = _mm_min_epi16(d, max); - _mm_storeu_si128((__m128i *)(&dest[j * 8]), d); - } - dest += stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index d5683ab1c..0460ab13b 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -279,6 +279,34 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { res3 = _mm_packs_epi32(tmp6, tmp7); \ } +static INLINE void recon_and_store4x4_sse2(const __m128i *const in, + uint8_t *const dest, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d[2]; + + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], in[0]); + d[1] = _mm_add_epi16(d[1], in[1]); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); +} + void idct4_sse2(__m128i *in); void idct8_sse2(__m128i *in); void idct16_sse2(__m128i *in0, __m128i *in1); diff --git a/vpx_dsp/x86/transpose_sse2.h b/vpx_dsp/x86/transpose_sse2.h new file mode 100644 index 000000000..7292723e3 --- /dev/null +++ b/vpx_dsp/x86/transpose_sse2.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#define VPX_DSP_X86_TRANSPOSE_SSE2_H_ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void transpose_4x4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + +#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_
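The new transpose_sse2.h closes the patch with transpose_4x4(), which transposes a 4x4 block of 16-bit values held as two registers, two rows per register, in just two unpack stages: the first interleaves row 0 with row 2 and row 1 with row 3, the second interleaves those results into columns. A standalone demonstration (the function body is copied from the header above; the main() test harness around it is illustrative only):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Copied from the header above: res[0] holds rows 0 and 1, res[1] holds
 * rows 2 and 3; on return each register holds two columns of the result. */
static void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}

int main(void) {
  int16_t m[16] = { 0,  1,  2,  3,  10, 11, 12, 13,
                    20, 21, 22, 23, 30, 31, 32, 33 };
  int16_t t[16];
  __m128i in[2];
  int i;
  in[0] = _mm_loadu_si128((const __m128i *)m);
  in[1] = _mm_loadu_si128((const __m128i *)(m + 8));
  transpose_4x4(in);
  _mm_storeu_si128((__m128i *)t, in[0]);
  _mm_storeu_si128((__m128i *)(t + 8), in[1]);
  for (i = 0; i < 16; ++i) printf("%3d%c", t[i], (i % 4 == 3) ? '\n' : ' ');
  /* Prints the transpose: 0 10 20 30 / 1 11 21 31 / 2 12 22 32 / 3 13 23 33 */
  return 0;
}

The larger array_transpose_8x8 and array_transpose_16x16 helpers used by the new high-bitdepth SSE2 idcts build on the same interleave idea across more registers.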