From: Yi Luo Date: Thu, 28 Apr 2016 23:10:14 +0000 (-0700) Subject: HBD hybrid transform 8x8 SSE4.1 optimization X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=299c5fc202d4a041b3f87de359121d03aa314c65;p=libvpx HBD hybrid transform 8x8 SSE4.1 optimization - Tx_type: DCT_DCT, DCT_ADST, ADST_DCT, ADST_ADST. - Update bit-exact unit test against current C version. - HBD encoder speed improves ~3.8%. Change-Id: Ie13925ba11214eef2b5326814940638507bf68ec --- diff --git a/test/vp10_fht8x8_test.cc b/test/vp10_fht8x8_test.cc index df1826f5d..468b8c9b8 100644 --- a/test/vp10_fht8x8_test.cc +++ b/test/vp10_fht8x8_test.cc @@ -27,13 +27,28 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, int tx_type); using libvpx_test::FhtFunc; -typedef std::tr1::tuple Ht8x8Param; +using std::tr1::tuple; +typedef tuple Ht8x8Param; void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { vp10_fht8x8_c(in, out, stride, tx_type); } +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*IhighbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd); +typedef void (*HBDFhtFunc)(const int16_t *input, int32_t *output, int stride, + int tx_type, int bd); +// Target optimized function, tx_type, bit depth +typedef tuple HighbdHt8x8Param; + +void highbe_fht8x8_ref(const int16_t *in, int32_t *out, int stride, + int tx_type, int bd) { + vp10_fwd_txfm2d_8x8_c(in, out, stride, tx_type, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + class VP10Trans8x8HT : public libvpx_test::TransformTestBase, public ::testing::TestWithParam { @@ -69,60 +84,76 @@ TEST_P(VP10Trans8x8HT, CoeffCheck) { RunCoeffCheck(); } -#if CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH -TEST(VP10Trans8x8HTSpeedTest, C_version) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 20000; - int bit_depth = 8; - int mask = (1 << bit_depth) - 1; - const int num_coeffs = 64; - int16_t *input = new int16_t[num_coeffs]; - tran_low_t *output = new tran_low_t[num_coeffs]; - const int stride = 8; - int tx_type; - - for (int i = 0; i < count_test_block; ++i) { - for (int j = 0; j < num_coeffs; ++j) { - input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask); - } - for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) { - vp10_fht8x8_c(input, output, stride, tx_type); - } +#if CONFIG_VP9_HIGHBITDEPTH +class VP10HighbdTrans8x8HT : public ::testing::TestWithParam { + public: + virtual ~VP10HighbdTrans8x8HT() {} + + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + fwd_txfm_ref_ = highbe_fht8x8_ref; + tx_type_ = GET_PARAM(1); + bit_depth_ = GET_PARAM(2); + mask_ = (1 << bit_depth_) - 1; + num_coeffs_ = 64; + + input_ = reinterpret_cast + (vpx_memalign(16, sizeof(int16_t) * num_coeffs_)); + output_ = reinterpret_cast + (vpx_memalign(16, sizeof(int32_t) * num_coeffs_)); + output_ref_ = reinterpret_cast + (vpx_memalign(16, sizeof(int32_t) * num_coeffs_)); + } + + virtual void TearDown() { + vpx_free(input_); + vpx_free(output_); + vpx_free(output_ref_); + libvpx_test::ClearSystemState(); + } + + protected: + void RunBitexactCheck(); + + private: + HBDFhtFunc fwd_txfm_; + HBDFhtFunc fwd_txfm_ref_; + int tx_type_; + int bit_depth_; + int mask_; + int num_coeffs_; + int16_t *input_; + int32_t *output_; + int32_t *output_ref_; +}; + +void VP10HighbdTrans8x8HT::RunBitexactCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int i, j; + const int stride = 8; + const int num_tests = 200000; + const int num_coeffs = 64; + + for (i = 0; i < num_tests; ++i) { + for (j = 0; j < num_coeffs; ++j) { + input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); } - delete[] input; - delete[] output; -} -#endif // CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH - -#if HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH -TEST(VP10Trans8x8HTSpeedTest, SSE2_version) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 20000; - int bit_depth = 8; - int mask = (1 << bit_depth) - 1; - const int num_coeffs = 64; - int16_t *input = reinterpret_cast - (vpx_memalign(16, sizeof(int16_t) * num_coeffs)); - tran_low_t *output = reinterpret_cast - (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs)); - - const int stride = 8; - int tx_type; - - for (int i = 0; i < count_test_block; ++i) { - for (int j = 0; j < num_coeffs; ++j) { - input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask); - } - for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) { - vp10_fht8x8_sse2(input, output, stride, tx_type); - } + fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_); + fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_); + + for (j = 0; j < num_coeffs; ++j) { + EXPECT_EQ(output_[j], output_ref_[j]) + << "Not bit-exact result at index: " << j + << " at test block: " << i; } + } +} - vpx_free(input); - vpx_free(output); +TEST_P(VP10HighbdTrans8x8HT, HighbdCoeffCheck) { + RunBitexactCheck(); } -#endif // HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_VP9_HIGHBITDEPTH using std::tr1::make_tuple; @@ -166,4 +197,20 @@ INSTANTIATE_TEST_CASE_P( ::testing::ValuesIn(kArrayHt8x8Param_sse2)); #endif // HAVE_SSE2 +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +const HighbdHt8x8Param kArrayHBDHt8x8Param_sse4_1[] = { + make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 0, 10), + make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 0, 12), + make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 1, 10), + make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 1, 12), + make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 10), + make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 12), + make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 10), + make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 12) +}; +INSTANTIATE_TEST_CASE_P( + SSE4_1, VP10HighbdTrans8x8HT, + ::testing::ValuesIn(kArrayHBDHt8x8Param_sse4_1)); +#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + } // namespace diff --git a/vp10/common/x86/idct_intrin_sse2.c b/vp10/common/x86/idct_intrin_sse2.c index 900f09188..d8b6d957b 100644 --- a/vp10/common/x86/idct_intrin_sse2.c +++ b/vp10/common/x86/idct_intrin_sse2.c @@ -14,13 +14,6 @@ #include "vp10/common/enums.h" #if CONFIG_EXT_TX -// Reverse the 8 16 bit words in __m128i -static INLINE __m128i mm_reverse_epi16(const __m128i x) { - const __m128i a = _mm_shufflelo_epi16(x, 0x1b); - const __m128i b = _mm_shufflehi_epi16(a, 0x1b); - return _mm_shuffle_epi32(b, 0x4e); -} - static INLINE void fliplr_4x4(__m128i in[2]) { in[0] = _mm_shufflelo_epi16(in[0], 0x1b); in[0] = _mm_shufflehi_epi16(in[0], 0x1b); diff --git a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c index 1b1108723..daed25ef5 100644 --- a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c +++ b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c @@ -87,15 +87,6 @@ static inline void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, transpose_32(txfm_size, buf_128, out_128); } -void vp10_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, - const int stride, int tx_type, - const int bd) { - int32_t txfm_buf[64]; - const TXFM_2D_CFG* cfg = vp10_get_txfm_8x8_cfg(tx_type); - (void)bd; - fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf); -} - void vp10_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, const int stride, int tx_type, const int bd) { diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c index 47422adb2..ea0ccb87b 100644 --- a/vp10/encoder/x86/dct_sse2.c +++ b/vp10/encoder/x86/dct_sse2.c @@ -18,13 +18,6 @@ #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" -// Reverse the 8 16 bit words in __m128i -static INLINE __m128i mm_reverse_epi16(const __m128i x) { - const __m128i a = _mm_shufflelo_epi16(x, 0x1b); - const __m128i b = _mm_shufflehi_epi16(a, 0x1b); - return _mm_shuffle_epi32(b, 0x4e); -} - static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, int stride, int flipud, int fliplr) { const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); diff --git a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c index 3cda783a7..949816c65 100644 --- a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c @@ -16,6 +16,7 @@ #include "vp10/common/vp10_fwd_txfm2d_cfg.h" #include "vp10/common/vp10_txfm.h" #include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, @@ -244,3 +245,726 @@ void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, tran_low_t *coeff, } (void)bd; } + +static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr, + int shift) { + __m128i u; + if (!flipud) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + } else { + in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); + } + + u = _mm_unpackhi_epi64(in[4], in[4]); + in[8] = _mm_cvtepi16_epi32(in[4]); + in[9] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[5], in[5]); + in[10] = _mm_cvtepi16_epi32(in[5]); + in[11] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[6], in[6]); + in[12] = _mm_cvtepi16_epi32(in[6]); + in[13] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[7], in[7]); + in[14] = _mm_cvtepi16_epi32(in[7]); + in[15] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[3], in[3]); + in[6] = _mm_cvtepi16_epi32(in[3]); + in[7] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[2], in[2]); + in[4] = _mm_cvtepi16_epi32(in[2]); + in[5] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[1], in[1]); + in[2] = _mm_cvtepi16_epi32(in[1]); + in[3] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[0], in[0]); + in[0] = _mm_cvtepi16_epi32(in[0]); + in[1] = _mm_cvtepi16_epi32(u); + + in[0] = _mm_slli_epi32(in[0], shift); + in[1] = _mm_slli_epi32(in[1], shift); + in[2] = _mm_slli_epi32(in[2], shift); + in[3] = _mm_slli_epi32(in[3], shift); + in[4] = _mm_slli_epi32(in[4], shift); + in[5] = _mm_slli_epi32(in[5], shift); + in[6] = _mm_slli_epi32(in[6], shift); + in[7] = _mm_slli_epi32(in[7], shift); + + in[8] = _mm_slli_epi32(in[8], shift); + in[9] = _mm_slli_epi32(in[9], shift); + in[10] = _mm_slli_epi32(in[10], shift); + in[11] = _mm_slli_epi32(in[11], shift); + in[12] = _mm_slli_epi32(in[12], shift); + in[13] = _mm_slli_epi32(in[13], shift); + in[14] = _mm_slli_epi32(in[14], shift); + in[15] = _mm_slli_epi32(in[15], shift); +} + +static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) { + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rounding); + in[1] = _mm_add_epi32(in[1], rounding); + in[2] = _mm_add_epi32(in[2], rounding); + in[3] = _mm_add_epi32(in[3], rounding); + in[4] = _mm_add_epi32(in[4], rounding); + in[5] = _mm_add_epi32(in[5], rounding); + in[6] = _mm_add_epi32(in[6], rounding); + in[7] = _mm_add_epi32(in[7], rounding); + in[8] = _mm_add_epi32(in[8], rounding); + in[9] = _mm_add_epi32(in[9], rounding); + in[10] = _mm_add_epi32(in[10], rounding); + in[11] = _mm_add_epi32(in[11], rounding); + in[12] = _mm_add_epi32(in[12], rounding); + in[13] = _mm_add_epi32(in[13], rounding); + in[14] = _mm_add_epi32(in[14], rounding); + in[15] = _mm_add_epi32(in[15], rounding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + in[4] = _mm_srai_epi32(in[4], shift); + in[5] = _mm_srai_epi32(in[5], shift); + in[6] = _mm_srai_epi32(in[6], shift); + in[7] = _mm_srai_epi32(in[7], shift); + in[8] = _mm_srai_epi32(in[8], shift); + in[9] = _mm_srai_epi32(in[9], shift); + in[10] = _mm_srai_epi32(in[10], shift); + in[11] = _mm_srai_epi32(in[11], shift); + in[12] = _mm_srai_epi32(in[12], shift); + in[13] = _mm_srai_epi32(in[13], shift); + in[14] = _mm_srai_epi32(in[14], shift); + in[15] = _mm_srai_epi32(in[15], shift); +} + +#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ + do { \ + __m128i u0, u1, u2, u3; \ + u0 = _mm_unpacklo_epi32(x0, x1); \ + u1 = _mm_unpackhi_epi32(x0, x1); \ + u2 = _mm_unpacklo_epi32(x2, x3); \ + u3 = _mm_unpackhi_epi32(x2, x3); \ + y0 = _mm_unpacklo_epi64(u0, u2); \ + y1 = _mm_unpackhi_epi64(u0, u2); \ + y2 = _mm_unpacklo_epi64(u1, u3); \ + y3 = _mm_unpackhi_epi64(u1, u3); \ + } while (0) + +static INLINE void transpose_8x8(__m128i *in) { + __m128i t[4]; + + TRANSPOSE_4X4(in[0], in[2], in[4], in[6], in[0], in[2], in[4], in[6]); + TRANSPOSE_4X4(in[1], in[3], in[5], in[7], t[0], t[1], t[2], t[3]); + TRANSPOSE_4X4(in[8], in[10], in[12], in[14], in[1], in[3], in[5], in[7]); + in[8] = t[0]; + in[10] = t[1]; + in[12] = t[2]; + in[14] = t[3]; + TRANSPOSE_4X4(in[9], in[11], in[13], in[15], in[9], in[11], in[13], in[15]); +} + +static INLINE void write_buffer_8x8(__m128i *res, tran_low_t *output) { + _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); + _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); + _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); + _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); + + _mm_store_si128((__m128i *)(output + 4 * 4), res[4]); + _mm_store_si128((__m128i *)(output + 5 * 4), res[5]); + _mm_store_si128((__m128i *)(output + 6 * 4), res[6]); + _mm_store_si128((__m128i *)(output + 7 * 4), res[7]); + + _mm_store_si128((__m128i *)(output + 8 * 4), res[8]); + _mm_store_si128((__m128i *)(output + 9 * 4), res[9]); + _mm_store_si128((__m128i *)(output + 10 * 4), res[10]); + _mm_store_si128((__m128i *)(output + 11 * 4), res[11]); + + _mm_store_si128((__m128i *)(output + 12 * 4), res[12]); + _mm_store_si128((__m128i *)(output + 13 * 4), res[13]); + _mm_store_si128((__m128i *)(output + 14 * 4), res[14]); + _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); +} + +static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[8], v[8]; + + // Even 8 points 0, 2, ..., 14 + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[0], in[14]); + v[7] = _mm_sub_epi32(in[0], in[14]); // v[7] + u[1] = _mm_add_epi32(in[2], in[12]); + u[6] = _mm_sub_epi32(in[2], in[12]); + u[2] = _mm_add_epi32(in[4], in[10]); + u[5] = _mm_sub_epi32(in[4], in[10]); + u[3] = _mm_add_epi32(in[6], in[8]); + v[4] = _mm_sub_epi32(in[6], in[8]); // v[4] + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[3]); + v[3] = _mm_sub_epi32(u[0], u[3]); + v[1] = _mm_add_epi32(u[1], u[2]); + v[2] = _mm_sub_epi32(u[1], u[2]); + + v[5] = _mm_mullo_epi32(u[5], cospim32); + v[6] = _mm_mullo_epi32(u[6], cospi32); + v[5] = _mm_add_epi32(v[5], v[6]); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + u[0] = _mm_mullo_epi32(u[5], cospi32); + v[6] = _mm_mullo_epi32(u[6], cospim32); + v[6] = _mm_sub_epi32(u[0], v[6]); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm_mullo_epi32(v[0], cospi32); + v[1] = _mm_mullo_epi32(v[1], cospi32); + u[0] = _mm_add_epi32(v[0], v[1]); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_sub_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm_mullo_epi32(v[2], cospi48); + v[1] = _mm_mullo_epi32(v[3], cospi16); + u[2] = _mm_add_epi32(v[0], v[1]); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + v[0] = _mm_mullo_epi32(v[2], cospi16); + v[1] = _mm_mullo_epi32(v[3], cospi48); + u[3] = _mm_sub_epi32(v[1], v[0]); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + u[4] = _mm_add_epi32(v[4], v[5]); + u[5] = _mm_sub_epi32(v[4], v[5]); + u[6] = _mm_sub_epi32(v[7], v[6]); + u[7] = _mm_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm_mullo_epi32(u[4], cospi56); + v[1] = _mm_mullo_epi32(u[7], cospi8); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[2] = _mm_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm_mullo_epi32(u[4], cospi8); + v[1] = _mm_mullo_epi32(u[7], cospi56); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[14] = _mm_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm_mullo_epi32(u[5], cospi24); + v[1] = _mm_mullo_epi32(u[6], cospi40); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[10] = _mm_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm_mullo_epi32(u[5], cospi40); + v[1] = _mm_mullo_epi32(u[6], cospi24); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[6] = _mm_srai_epi32(v[0], bit); // buf0[6] + + out[0] = u[0]; // buf0[0] + out[8] = u[1]; // buf0[1] + out[4] = u[2]; // buf0[2] + out[12] = u[3]; // buf0[3] + + // Odd 8 points: 1, 3, ..., 15 + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[1], in[15]); + v[7] = _mm_sub_epi32(in[1], in[15]); // v[7] + u[1] = _mm_add_epi32(in[3], in[13]); + u[6] = _mm_sub_epi32(in[3], in[13]); + u[2] = _mm_add_epi32(in[5], in[11]); + u[5] = _mm_sub_epi32(in[5], in[11]); + u[3] = _mm_add_epi32(in[7], in[9]); + v[4] = _mm_sub_epi32(in[7], in[9]); // v[4] + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[3]); + v[3] = _mm_sub_epi32(u[0], u[3]); + v[1] = _mm_add_epi32(u[1], u[2]); + v[2] = _mm_sub_epi32(u[1], u[2]); + + v[5] = _mm_mullo_epi32(u[5], cospim32); + v[6] = _mm_mullo_epi32(u[6], cospi32); + v[5] = _mm_add_epi32(v[5], v[6]); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + u[0] = _mm_mullo_epi32(u[5], cospi32); + v[6] = _mm_mullo_epi32(u[6], cospim32); + v[6] = _mm_sub_epi32(u[0], v[6]); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm_mullo_epi32(v[0], cospi32); + v[1] = _mm_mullo_epi32(v[1], cospi32); + u[0] = _mm_add_epi32(v[0], v[1]); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_sub_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm_mullo_epi32(v[2], cospi48); + v[1] = _mm_mullo_epi32(v[3], cospi16); + u[2] = _mm_add_epi32(v[0], v[1]); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + v[0] = _mm_mullo_epi32(v[2], cospi16); + v[1] = _mm_mullo_epi32(v[3], cospi48); + u[3] = _mm_sub_epi32(v[1], v[0]); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + u[4] = _mm_add_epi32(v[4], v[5]); + u[5] = _mm_sub_epi32(v[4], v[5]); + u[6] = _mm_sub_epi32(v[7], v[6]); + u[7] = _mm_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm_mullo_epi32(u[4], cospi56); + v[1] = _mm_mullo_epi32(u[7], cospi8); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[3] = _mm_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm_mullo_epi32(u[4], cospi8); + v[1] = _mm_mullo_epi32(u[7], cospi56); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[15] = _mm_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm_mullo_epi32(u[5], cospi24); + v[1] = _mm_mullo_epi32(u[6], cospi40); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[11] = _mm_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm_mullo_epi32(u[5], cospi40); + v[1] = _mm_mullo_epi32(u[6], cospi24); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[7] = _mm_srai_epi32(v[0], bit); // buf0[6] + + out[1] = u[0]; // buf0[0] + out[9] = u[1]; // buf0[1] + out[5] = u[2]; // buf0[2] + out[13] = u[3]; // buf0[3] +} + +static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + __m128i u[8], v[8], x; + + // Even 8 points: 0, 2, ..., 14 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[14], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[14], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[10], cospi20); + x = _mm_mullo_epi32(in[4], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[10], cospi44); + x = _mm_mullo_epi32(in[4], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[6], cospi36); + x = _mm_mullo_epi32(in[8], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[6], cospi28); + x = _mm_mullo_epi32(in[8], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[2], cospi52); + x = _mm_mullo_epi32(in[12], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[2], cospi12); + x = _mm_mullo_epi32(in[12], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + v[0] = _mm_add_epi32(u[0], u[4]); + v[4] = _mm_sub_epi32(u[0], u[4]); + v[1] = _mm_add_epi32(u[1], u[5]); + v[5] = _mm_sub_epi32(u[1], u[5]); + v[2] = _mm_add_epi32(u[2], u[6]); + v[6] = _mm_sub_epi32(u[2], u[6]); + v[3] = _mm_add_epi32(u[3], u[7]); + v[7] = _mm_sub_epi32(u[3], u[7]); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + v[0] = _mm_add_epi32(u[0], u[2]); + v[2] = _mm_sub_epi32(u[0], u[2]); + v[1] = _mm_add_epi32(u[1], u[3]); + v[3] = _mm_sub_epi32(u[1], u[3]); + v[4] = _mm_add_epi32(u[4], u[6]); + v[6] = _mm_sub_epi32(u[4], u[6]); + v[5] = _mm_add_epi32(u[5], u[7]); + v[7] = _mm_sub_epi32(u[5], u[7]); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + out[0] = u[0]; + out[2] = _mm_sub_epi32(kZero, u[4]); + out[4] = u[6]; + out[6] = _mm_sub_epi32(kZero, u[2]); + out[8] = u[3]; + out[10] = _mm_sub_epi32(kZero, u[7]); + out[12] = u[5]; + out[14] = _mm_sub_epi32(kZero, u[1]); + + // Odd 8 points: 1, 3, ..., 15 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[15], cospi4); + x = _mm_mullo_epi32(in[1], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[15], cospi60); + x = _mm_mullo_epi32(in[1], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[11], cospi20); + x = _mm_mullo_epi32(in[5], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[11], cospi44); + x = _mm_mullo_epi32(in[5], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[7], cospi36); + x = _mm_mullo_epi32(in[9], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[7], cospi28); + x = _mm_mullo_epi32(in[9], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[3], cospi52); + x = _mm_mullo_epi32(in[13], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[3], cospi12); + x = _mm_mullo_epi32(in[13], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + v[0] = _mm_add_epi32(u[0], u[4]); + v[4] = _mm_sub_epi32(u[0], u[4]); + v[1] = _mm_add_epi32(u[1], u[5]); + v[5] = _mm_sub_epi32(u[1], u[5]); + v[2] = _mm_add_epi32(u[2], u[6]); + v[6] = _mm_sub_epi32(u[2], u[6]); + v[3] = _mm_add_epi32(u[3], u[7]); + v[7] = _mm_sub_epi32(u[3], u[7]); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + v[0] = _mm_add_epi32(u[0], u[2]); + v[2] = _mm_sub_epi32(u[0], u[2]); + v[1] = _mm_add_epi32(u[1], u[3]); + v[3] = _mm_sub_epi32(u[1], u[3]); + v[4] = _mm_add_epi32(u[4], u[6]); + v[6] = _mm_sub_epi32(u[4], u[6]); + v[5] = _mm_add_epi32(u[5], u[7]); + v[7] = _mm_sub_epi32(u[5], u[7]); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + out[1] = u[0]; + out[3] = _mm_sub_epi32(kZero, u[4]); + out[5] = u[6]; + out[7] = _mm_sub_epi32(kZero, u[2]); + out[9] = u[3]; + out[11] = _mm_sub_epi32(kZero, u[7]); + out[13] = u[5]; + out[15] = _mm_sub_epi32(kZero, u[1]); +} + +void vp10_fwd_txfm2d_8x8_sse4_1(const int16_t *input, tran_low_t *coeff, + int stride, int tx_type, int bd) { + __m128i in[16], out[16]; + const TXFM_2D_CFG *cfg = NULL; + + switch (tx_type) { + case DCT_DCT: + cfg = &fwd_txfm_2d_cfg_dct_dct_8; + load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); + fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out); + fdct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in); + write_buffer_8x8(in, coeff); + break; + case ADST_DCT: + cfg = &fwd_txfm_2d_cfg_adst_dct_8; + load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); + fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out); + fdct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in); + write_buffer_8x8(in, coeff); + break; + case DCT_ADST: + cfg = &fwd_txfm_2d_cfg_dct_adst_8; + load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); + fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out); + fadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in); + write_buffer_8x8(in, coeff); + break; + case ADST_ADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_8; + load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); + fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out); + fadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + transpose_8x8(in); + write_buffer_8x8(in, coeff); + break; + default: + assert(0); + } + (void)bd; +} diff --git a/vpx_dsp/x86/txfm_common_sse2.h b/vpx_dsp/x86/txfm_common_sse2.h index 536b20687..f886d30de 100644 --- a/vpx_dsp/x86/txfm_common_sse2.h +++ b/vpx_dsp/x86/txfm_common_sse2.h @@ -26,4 +26,11 @@ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) +// Reverse the 8 16 bit words in __m128i +static INLINE __m128i mm_reverse_epi16(const __m128i x) { + const __m128i a = _mm_shufflelo_epi16(x, 0x1b); + const __m128i b = _mm_shufflehi_epi16(a, 0x1b); + return _mm_shuffle_epi32(b, 0x4e); +} + #endif // VPX_DSP_X86_TXFM_COMMON_SSE2_H_