From: Yi Luo
Date: Tue, 8 Mar 2016 22:10:24 +0000 (-0800)
Subject: Implemented DST 16x16 SSE2 intrinsics optimization
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=50a164a1f6eb7b32d34b0e9fc40f5f6067fdfb57;p=libvpx

Implemented DST 16x16 SSE2 intrinsics optimization

- Implemented fdst16_sse2() and fdst16_8col() against the C version fdst16().
- Turned on the 7 DST-related hybrid txfm types in vp10_fht16x16_sse2().
- Replaced vp10_fht16x16_c() with vp10_fht16x16_sse2() in fwd_txfm_16x16().
- Added a vp10_fht16x16_sse2() unit test against the C version
  vp10_fht16x16_c() (--gtest_filter=*VP10Trans16x16*).
- Unit test passed.
- Speed improvement: 2.4%, 3.2%, and 3.2% for city_cif.y4m, garden_sif.y4m,
  and mobile_cif.y4m.

Change-Id: Ib30a67ce5d5964bef143d588d0f8fa438be8901f
---
diff --git a/test/test.mk b/test/test.mk
index 5983f42b4..1f120ce6d 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -168,6 +168,7 @@ LIBVPX_TEST_SRCS-yes += vp10_inv_txfm_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht8x8_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht16x16_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ANS) += vp10_ans_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc
new file mode 100644
index 000000000..d501e10d6
--- /dev/null
+++ b/test/vp10_fht16x16_test.cc
@@ -0,0 +1,124 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+
+using libvpx_test::FhtFunc;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int>
+Ht16x16Param;
+
+void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
+                  int tx_type) {
+  vp10_fht16x16_c(in, out, stride, tx_type);
+}
+
+class VP10Trans16x16HT
+    : public libvpx_test::TransformTestBase,
+      public ::testing::TestWithParam<Ht16x16Param> {
+ public:
+  virtual ~VP10Trans16x16HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_ = GET_PARAM(2);
+    pitch_ = 16;
+    fwd_txfm_ref = fht16x16_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
+  }
+
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  FhtFunc fwd_txfm_;
+  IhtFunc inv_txfm_;
+};
+
+TEST_P(VP10Trans16x16HT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP10Trans16x16HT,
+    ::testing::Values(
+#if !CONFIG_EXT_TX
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 0,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 1,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 2,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 3,
+                   VPX_BITS_8, 256)));
+#else
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 0,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 1,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 2,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 3,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 4,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 5,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 6,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 7,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 8,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 9,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 10,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 11,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 12,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 13,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 14,
+                   VPX_BITS_8, 256),
+        make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 15,
+                   VPX_BITS_8, 256)));
+#endif  // !CONFIG_EXT_TX
+#endif  // HAVE_SSE2
+
+}  // namespace
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 2344ce2b2..5adaaeb4d 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -155,7 +155,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp10_iht8x8_64_add sse2/;
   add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-  specialize qw/vp10_iht16x16_256_add/;
+  specialize qw/vp10_iht16x16_256_add sse2/;
   add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_fdct4x4 sse2/;
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index 029240f71..ffc925c3a 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -134,8 +134,6 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
-      break;
     case DST_DST:
     case DCT_DST:
     case DST_DCT:
@@ -143,8 +141,7 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
-      // Use C version since DST exists only in C
-      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
       break;
     case H_DCT:
     case V_DCT:
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c
index aaf1e6ab0..8ff7c9c79 100644
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_sse2.c
@@ -2420,6 +2420,351 @@ static void fadst16_8col(__m128i *in) {
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
+#if CONFIG_EXT_TX
+static void fdst16_8col(__m128i *in) {
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t) cospi_16_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t) -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+
+  const __m128i k__cospi_m08_m24 = pair_set_epi16(-cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+
+  const __m128i k__cospi_m30_p02 = pair_set_epi16(-cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_m14_p18 = pair_set_epi16(-cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m22_p10 = pair_set_epi16(-cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_m06_p26 = pair_set_epi16(-cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i x0, x1, x2, x3, t0, t1, t2, t3;
+  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
+
+  // (1)
+  u0 = _mm_sub_epi16(in[0], in[15]);
+  v7 =
_mm_add_epi16(in[0], in[15]); + + u1 = _mm_sub_epi16(in[1], in[14]); // -u1 + v6 = _mm_add_epi16(in[1], in[14]); // -v6 + + u2 = _mm_sub_epi16(in[2], in[13]); + v5 = _mm_add_epi16(in[2], in[13]); + + u3 = _mm_sub_epi16(in[3], in[12]); // -u3 + v4 = _mm_add_epi16(in[3], in[12]); // -v4 + + u4 = _mm_sub_epi16(in[4], in[11]); + v3 = _mm_add_epi16(in[4], in[11]); + + u5 = _mm_sub_epi16(in[5], in[10]); // -u5 + v2 = _mm_add_epi16(in[5], in[10]); // -v2 + + u6 = _mm_sub_epi16(in[6], in[9]); + v1 = _mm_add_epi16(in[6], in[9]); + + u7 = _mm_sub_epi16(in[7], in[8]); // -u7 + v0 = _mm_add_epi16(in[7], in[8]); // -v0 + + s0 = _mm_sub_epi16(u0, u7); + s1 = _mm_sub_epi16(u1, u6); // -s1 + s2 = _mm_sub_epi16(u2, u5); + s3 = _mm_sub_epi16(u3, u4); // -s3 + s4 = _mm_add_epi16(u3, u4); // -s4 + s5 = _mm_add_epi16(u2, u5); + s6 = _mm_add_epi16(u1, u6); // -s6 + s7 = _mm_add_epi16(u0, u7); + + x0 = _mm_sub_epi16(s0, s3); + x1 = _mm_sub_epi16(s1, s2); // -x1 + x2 = _mm_add_epi16(s1, s2); // -x2 + x3 = _mm_add_epi16(s0, s3); + + y0 = _mm_unpacklo_epi16(x0, x1); + y1 = _mm_unpackhi_epi16(x0, x1); + y2 = _mm_unpacklo_epi16(x2, x3); + y3 = _mm_unpackhi_epi16(x2, x3); + + t0 = _mm_madd_epi16(y0, k__cospi_p16_m16); + t1 = _mm_madd_epi16(y1, k__cospi_p16_m16); + t2 = _mm_madd_epi16(y0, k__cospi_p16_p16); + t3 = _mm_madd_epi16(y1, k__cospi_p16_p16); + x0 = _mm_madd_epi16(y2, k__cospi_m24_p08); + x1 = _mm_madd_epi16(y3, k__cospi_m24_p08); + x2 = _mm_madd_epi16(y2, k__cospi_p08_p24); + x3 = _mm_madd_epi16(y3, k__cospi_p08_p24); + + y0 = _mm_add_epi32(t0, k__DCT_CONST_ROUNDING); + y1 = _mm_add_epi32(t1, k__DCT_CONST_ROUNDING); + y2 = _mm_add_epi32(t2, k__DCT_CONST_ROUNDING); + y3 = _mm_add_epi32(t3, k__DCT_CONST_ROUNDING); + y4 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING); + y5 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING); + y6 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING); + y7 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING); + + t0 = _mm_srai_epi32(y0, DCT_CONST_BITS); + t1 = _mm_srai_epi32(y1, DCT_CONST_BITS); + t2 = _mm_srai_epi32(y2, DCT_CONST_BITS); + t3 = _mm_srai_epi32(y3, DCT_CONST_BITS); + x0 = _mm_srai_epi32(y4, DCT_CONST_BITS); + x1 = _mm_srai_epi32(y5, DCT_CONST_BITS); + x2 = _mm_srai_epi32(y6, DCT_CONST_BITS); + x3 = _mm_srai_epi32(y7, DCT_CONST_BITS); + + in[15] = _mm_packs_epi32(t0, t1); + in[11] = _mm_packs_epi32(x0, x1); + in[7] = _mm_packs_epi32(t2, t3); + in[3] = _mm_packs_epi32(x2, x3); + + // (2) + t0 = _mm_unpacklo_epi16(s6, s5); + t1 = _mm_unpackhi_epi16(s6, s5); + + y0 = _mm_madd_epi16(t0, k__cospi_m16_m16); + y1 = _mm_madd_epi16(t1, k__cospi_m16_m16); + y2 = _mm_madd_epi16(t0, k__cospi_m16_p16); + y3 = _mm_madd_epi16(t1, k__cospi_m16_p16); + + x0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING); + x1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING); + x2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING); + x3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING); + + y4 = _mm_srai_epi32(x0, DCT_CONST_BITS); + y5 = _mm_srai_epi32(x1, DCT_CONST_BITS); + y6 = _mm_srai_epi32(x2, DCT_CONST_BITS); + y7 = _mm_srai_epi32(x3, DCT_CONST_BITS); + + t2 = _mm_packs_epi32(y4, y5); + t3 = _mm_packs_epi32(y6, y7); + + x0 = _mm_sub_epi16(s4, t2); // -x0 + x1 = _mm_add_epi16(s4, t2); // -x1 + x2 = _mm_sub_epi16(s7, t3); + x3 = _mm_add_epi16(s7, t3); + + y0 = _mm_unpacklo_epi16(x0, x3); + y1 = _mm_unpackhi_epi16(x0, x3); + y2 = _mm_unpacklo_epi16(x1, x2); + y3 = _mm_unpackhi_epi16(x1, x2); + + w0 = _mm_madd_epi16(y0, k__cospi_m28_p04); + w1 = _mm_madd_epi16(y1, k__cospi_m28_p04); + w2 = _mm_madd_epi16(y2, k__cospi_m12_p20); + w3 = _mm_madd_epi16(y3, 
k__cospi_m12_p20); + w4 = _mm_madd_epi16(y2, k__cospi_p20_p12); + w5 = _mm_madd_epi16(y3, k__cospi_p20_p12); + w6 = _mm_madd_epi16(y0, k__cospi_p04_p28); + w7 = _mm_madd_epi16(y1, k__cospi_p04_p28); + + u0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + y0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + y1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + y2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + y3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + y4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + y5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + y6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + y7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + in[13] = _mm_packs_epi32(y0, y1); + in[9] = _mm_packs_epi32(y4, y5); + in[5] = _mm_packs_epi32(y2, y3); + in[1] = _mm_packs_epi32(y6, y7); + + // (3) + y0 = _mm_unpacklo_epi16(v5, v2); + y1 = _mm_unpackhi_epi16(v5, v2); + y2 = _mm_unpacklo_epi16(v4, v3); + y3 = _mm_unpackhi_epi16(v4, v3); + + u0 = _mm_madd_epi16(y0, k__cospi_p16_p16); + u1 = _mm_madd_epi16(y1, k__cospi_p16_p16); + u2 = _mm_madd_epi16(y2, k__cospi_m16_m16); + u3 = _mm_madd_epi16(y3, k__cospi_m16_m16); + u4 = _mm_madd_epi16(y2, k__cospi_m16_p16); + u5 = _mm_madd_epi16(y3, k__cospi_m16_p16); + u6 = _mm_madd_epi16(y0, k__cospi_p16_m16); + u7 = _mm_madd_epi16(y1, k__cospi_p16_m16); + + w0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + w1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + w2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + w3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + w4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + w5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + w6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + w7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + s0 = _mm_srai_epi32(w0, DCT_CONST_BITS); + s1 = _mm_srai_epi32(w1, DCT_CONST_BITS); + s2 = _mm_srai_epi32(w2, DCT_CONST_BITS); + s3 = _mm_srai_epi32(w3, DCT_CONST_BITS); + s4 = _mm_srai_epi32(w4, DCT_CONST_BITS); + s5 = _mm_srai_epi32(w5, DCT_CONST_BITS); + s6 = _mm_srai_epi32(w6, DCT_CONST_BITS); + s7 = _mm_srai_epi32(w7, DCT_CONST_BITS); + + y2 = _mm_packs_epi32(s0, s1); + y3 = _mm_packs_epi32(s2, s3); + y4 = _mm_packs_epi32(s4, s5); + y5 = _mm_packs_epi32(s6, s7); + + // step 3 + w0 = _mm_sub_epi16(v0, y3); // -w0 + w1 = _mm_add_epi16(v1, y2); + w2 = _mm_sub_epi16(v1, y2); + w3 = _mm_add_epi16(v0, y3); // -w3 + w4 = _mm_sub_epi16(v7, y4); + w5 = _mm_add_epi16(v6, y5); // -w5 + w6 = _mm_sub_epi16(v6, y5); // -w6 + w7 = _mm_add_epi16(v7, y4); + + // step 4 + x0 = _mm_unpacklo_epi16(w1, w6); + x1 = _mm_unpackhi_epi16(w1, w6); + x2 = _mm_unpacklo_epi16(w2, w5); + x3 = _mm_unpackhi_epi16(w2, w5); + + u0 = _mm_madd_epi16(x0, k__cospi_m08_m24); + u1 = _mm_madd_epi16(x1, k__cospi_m08_m24); + u2 = _mm_madd_epi16(x2, k__cospi_p24_m08); + u3 = _mm_madd_epi16(x3, k__cospi_p24_m08); + u4 = _mm_madd_epi16(x2, k__cospi_p08_p24); + u5 = _mm_madd_epi16(x3, k__cospi_p08_p24); + u6 = _mm_madd_epi16(x0, k__cospi_p24_m08); + u7 = _mm_madd_epi16(x1, k__cospi_p24_m08); + + s0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + s1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + s2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + s3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + s4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + s5 = _mm_add_epi32(u5, 
k__DCT_CONST_ROUNDING); + s6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + s7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(s0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(s1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(s2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(s3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(s4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(s5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(s6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(s7, DCT_CONST_BITS); + + y1 = _mm_packs_epi32(u0, u1); + y2 = _mm_packs_epi32(u2, u3); + y5 = _mm_packs_epi32(u4, u5); + y6 = _mm_packs_epi32(u6, u7); + + // step 5 + v0 = _mm_sub_epi16(w0, y1); // -v0 + v1 = _mm_add_epi16(w0, y1); // -v1 + v2 = _mm_sub_epi16(w3, y2); // -v2 + v3 = _mm_add_epi16(w3, y2); // -v3 + v4 = _mm_sub_epi16(w4, y5); + v5 = _mm_add_epi16(w4, y5); + v6 = _mm_sub_epi16(w7, y6); + v7 = _mm_add_epi16(w7, y6); + + u0 = _mm_unpacklo_epi16(v0, v7); + u1 = _mm_unpackhi_epi16(v0, v7); + u2 = _mm_unpacklo_epi16(v1, v6); + u3 = _mm_unpackhi_epi16(v1, v6); + u4 = _mm_unpacklo_epi16(v2, v5); + u5 = _mm_unpackhi_epi16(v2, v5); + u6 = _mm_unpacklo_epi16(v3, v4); + u7 = _mm_unpackhi_epi16(v3, v4); + + s0 = _mm_madd_epi16(u0, k__cospi_m30_p02); // x0 + s1 = _mm_madd_epi16(u1, k__cospi_m30_p02); + s2 = _mm_madd_epi16(u2, k__cospi_m14_p18); // x1 + s3 = _mm_madd_epi16(u3, k__cospi_m14_p18); + s4 = _mm_madd_epi16(u4, k__cospi_m22_p10); // x2 + s5 = _mm_madd_epi16(u5, k__cospi_m22_p10); + s6 = _mm_madd_epi16(u6, k__cospi_m06_p26); // x3 + s7 = _mm_madd_epi16(u7, k__cospi_m06_p26); + + w0 = _mm_madd_epi16(u6, k__cospi_p26_p06); // x4 + w1 = _mm_madd_epi16(u7, k__cospi_p26_p06); + w2 = _mm_madd_epi16(u4, k__cospi_p10_p22); // x5 + w3 = _mm_madd_epi16(u5, k__cospi_p10_p22); + w4 = _mm_madd_epi16(u2, k__cospi_p18_p14); // x6 + w5 = _mm_madd_epi16(u3, k__cospi_p18_p14); + w6 = _mm_madd_epi16(u0, k__cospi_p02_p30); // x7 + w7 = _mm_madd_epi16(u1, k__cospi_p02_p30); + + v0 = _mm_add_epi32(s0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(s1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(s2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(s3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(s4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(s5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(s6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(s7, k__DCT_CONST_ROUNDING); + + y0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + y1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + y2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + y3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + y4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + y5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + y6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + y7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + s0 = _mm_srai_epi32(y0, DCT_CONST_BITS); + s1 = _mm_srai_epi32(y1, DCT_CONST_BITS); + s2 = _mm_srai_epi32(y2, DCT_CONST_BITS); + s3 = _mm_srai_epi32(y3, DCT_CONST_BITS); + s4 = _mm_srai_epi32(y4, DCT_CONST_BITS); + s5 = _mm_srai_epi32(y5, DCT_CONST_BITS); + s6 = _mm_srai_epi32(y6, DCT_CONST_BITS); + s7 = _mm_srai_epi32(y7, DCT_CONST_BITS); + + in[14] = _mm_packs_epi32(u0, u1); + in[6] = _mm_packs_epi32(u2, u3); + in[10] = _mm_packs_epi32(u4, u5); + in[2] = _mm_packs_epi32(u6, 
u7); + in[12] = _mm_packs_epi32(s0, s1); + in[4] = _mm_packs_epi32(s2, s3); + in[8] = _mm_packs_epi32(s4, s5); + in[0] = _mm_packs_epi32(s6, s7); +} +#endif // CONFIG_EXT_TX + static void fdct16_sse2(__m128i *in0, __m128i *in1) { fdct16_8col(in0); fdct16_8col(in1); @@ -2432,6 +2777,14 @@ static void fadst16_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); } +#if CONFIG_EXT_TX +static void fdst16_sse2(__m128i *in0, __m128i *in1) { + fdst16_8col(in0); + fdst16_8col(in1); + array_transpose_16x16(in0, in1); +} +#endif // CONFIG_EXT_TX + void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type) { __m128i in0[16], in1[16]; @@ -2497,6 +2850,55 @@ void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output, fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; + case DST_DST: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fdst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DCT_DST: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fdct16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DST_DCT: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fdst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DST_ADST: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fdst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case ADST_DST: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DST_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 0, 1); + fdst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case FLIPADST_DST: + load_buffer_16x16(input, in0, in1, stride, 1, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; #endif // CONFIG_EXT_TX default: assert(0);
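
For reference, every _mm_madd_epi16 / _mm_add_epi32(k__DCT_CONST_ROUNDING) / _mm_srai_epi32(..., DCT_CONST_BITS) group in fdst16_8col() above performs the same fixed-point butterfly step used throughout dct_sse2.c: multiply a pair of 16-bit inputs by a cospi constant pair, round, and shift back to 16-bit transform precision; fdst16_sse2() then applies this 1-D transform to the two 8-column halves and transposes, just like fdct16_sse2() and fadst16_sse2(). The standalone C sketch below is illustration only, not part of the patch; butterfly_round_shift() is a hypothetical helper, and the constant values are assumed to mirror DCT_CONST_BITS, DCT_CONST_ROUNDING, and cospi_16_64 from vpx_dsp/txfm_common.h.

/*
 * Scalar sketch of one _mm_madd_epi16 / _mm_add_epi32 / _mm_srai_epi32
 * group in fdst16_8col().  Hypothetical helper; constants assumed to
 * match vpx_dsp/txfm_common.h.
 */
#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* pair_set_epi16(c0, c1) interleaved with _mm_unpacklo/hi_epi16(a, b) makes
 * _mm_madd_epi16 produce a * c0 + b * c1 in each 32-bit lane; adding
 * k__DCT_CONST_ROUNDING and shifting right by DCT_CONST_BITS rounds the
 * product back to 16-bit precision. */
static int16_t butterfly_round_shift(int16_t a, int16_t b, int c0, int c1) {
  int32_t sum = (int32_t)a * c0 + (int32_t)b * c1;
  return (int16_t)((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}

int main(void) {
  const int cospi_16_64 = 11585;  /* 16384 * cos(pi/4), as in txfm_common.h */
  /* One lane of the first rotation in fdst16_8col():
   * t0 = madd(unpacklo(x0, x1), k__cospi_p16_m16) -> x0*cospi16 - x1*cospi16 */
  printf("%d\n", butterfly_round_shift(100, 40, cospi_16_64, -cospi_16_64));
  return 0;
}

The new test can be run in isolation with --gtest_filter=*VP10Trans16x16*, which checks each instantiated tx_type of vp10_fht16x16_sse2() against the C reference vp10_fht16x16_c() via the CoeffCheck case above.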