From 0fa59a4baf837d527459eacb1184a07df2dc71bc Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Wed, 28 Jun 2017 16:17:39 -0700 Subject: [PATCH] Refactor highbd idct 4x4 sse4.1 code and add highbd_inv_txfm_sse4.h Also clean highbd_inv_txfm_sse2.h BUG=webm:1412 Change-Id: I0722841d824ce602874019bd9779b10d49d10c0b --- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/x86/highbd_idct16x16_add_sse2.c | 8 +-- vpx_dsp/x86/highbd_idct4x4_add_sse2.c | 8 +-- vpx_dsp/x86/highbd_idct4x4_add_sse4.c | 50 +++------------- vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 4 +- vpx_dsp/x86/highbd_inv_txfm_sse2.h | 78 ++++++++++++------------- vpx_dsp/x86/highbd_inv_txfm_sse4.h | 60 +++++++++++++++++++ vpx_dsp/x86/inv_txfm_sse2.h | 44 ++++++++------ 8 files changed, 139 insertions(+), 114 deletions(-) create mode 100644 vpx_dsp/x86/highbd_inv_txfm_sse4.h diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index e8b89436b..badebc650 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -240,6 +240,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c endif # !CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c index 1df91f08f..154f11ba3 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -105,8 +105,8 @@ void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); inptr[i] = _mm_srai_epi16(inptr[i], 6); inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); + d[0] = add_clamp(d[0], inptr[i], bd); + d[1] = add_clamp(d[1], inptr[i + 16], bd); // Store _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); @@ -222,8 +222,8 @@ void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); inptr[i] = _mm_srai_epi16(inptr[i], 6); inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); + d[0] = add_clamp(d[0], inptr[i], bd); + d[1] = add_clamp(d[1], inptr[i + 16], bd); // Store _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c index ac6f73d83..1eb3a8347 100644 --- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -191,15 +191,11 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, io[1] = wraplow_16bit(io[2], io[3], _mm_set1_epi32(8)); } - recon_and_store_4(dest, io, stride, bd); + recon_and_store_4(io, dest, stride, bd); } void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - const __m128i zero = _mm_setzero_si128(); - // Faster than _mm_set1_epi16((1 << bd) - 1). 
- const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); int a1, i; tran_low_t out; __m128i dc, d; @@ -211,7 +207,7 @@ void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, for (i = 0; i < 4; ++i) { d = _mm_loadl_epi64((const __m128i *)dest); - d = add_dc_clamp(&zero, &max, &dc, &d); + d = add_clamp(d, dc, bd); _mm_storel_epi64((__m128i *)dest, d); dest += stride; } diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c index 9d1c6f4b7..fc522c1c0 100644 --- a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c @@ -12,15 +12,10 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" -static INLINE void extend_64bit(const __m128i in, - __m128i *const out /*out[2]*/) { - out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1 - out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3 -} - static INLINE void highbd_idct4(__m128i *const io) { const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0); @@ -28,46 +23,19 @@ static INLINE void highbd_idct4(__m128i *const io) { _mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0); const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0); - __m128i temp1[4], temp2[4], step[4]; + __m128i temp1[4], step[4]; transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]); // stage 1 temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] - temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] extend_64bit(temp1[0], temp1); - extend_64bit(temp2[0], temp2); - temp1[0] = _mm_mul_epi32(temp1[0], cospi_p16_p16); - temp1[1] = _mm_mul_epi32(temp1[1], cospi_p16_p16); - temp2[0] = _mm_mul_epi32(temp2[0], cospi_p16_p16); - temp2[1] = _mm_mul_epi32(temp2[1], cospi_p16_p16); - temp1[0] = dct_const_round_shift_64bit(temp1[0]); - temp1[1] = dct_const_round_shift_64bit(temp1[1]); - temp2[0] = dct_const_round_shift_64bit(temp2[0]); - temp2[1] = dct_const_round_shift_64bit(temp2[1]); - step[0] = pack_4(temp1[0], temp1[1]); - step[1] = pack_4(temp2[0], temp2[1]); - - extend_64bit(io[1], temp1); - extend_64bit(io[3], temp2); - temp1[2] = _mm_mul_epi32(temp1[0], cospi_p08_p08); - temp1[3] = _mm_mul_epi32(temp1[1], cospi_p08_p08); - temp1[0] = _mm_mul_epi32(temp1[0], cospi_p24_p24); - temp1[1] = _mm_mul_epi32(temp1[1], cospi_p24_p24); - temp2[2] = _mm_mul_epi32(temp2[0], cospi_p24_p24); - temp2[3] = _mm_mul_epi32(temp2[1], cospi_p24_p24); - temp2[0] = _mm_mul_epi32(temp2[0], cospi_p08_p08); - temp2[1] = _mm_mul_epi32(temp2[1], cospi_p08_p08); - temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); // [1]*cospi_24 - [3]*cospi_8 - temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); // [1]*cospi_24 - [3]*cospi_8 - temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); // [1]*cospi_8 + [3]*cospi_24 - temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); // [1]*cospi_8 + [3]*cospi_24 - temp1[0] = dct_const_round_shift_64bit(temp1[0]); - temp1[1] = dct_const_round_shift_64bit(temp1[1]); - temp2[0] = dct_const_round_shift_64bit(temp2[0]); - temp2[1] = dct_const_round_shift_64bit(temp2[1]); - step[2] = pack_4(temp1[0], temp1[1]); - step[3] = pack_4(temp2[0], temp2[1]); + step[0] = multiplication_round_shift(temp1, cospi_p16_p16); + temp1[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + extend_64bit(temp1[0], temp1); + step[1] = multiplication_round_shift(temp1, 
cospi_p16_p16); + multiplication_and_add_2_ssse4_1(&io[1], &io[3], &cospi_p24_p24, + &cospi_p08_p08, &step[2], &step[3]); // stage 2 io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] @@ -103,5 +71,5 @@ void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[1] = wraplow_16bit(io[2], io[3], _mm_set1_epi32(8)); } - recon_and_store_4(dest, io, stride, bd); + recon_and_store_4(io, dest, stride, bd); } diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index c12e3e1b9..8eae17581 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -94,7 +94,7 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, inptr[i] = _mm_add_epi16(inptr[i], sixteen); d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + d[i] = add_clamp(d[i], inptr[i], bd); // Store _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); } @@ -196,7 +196,7 @@ void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, inptr[i] = _mm_add_epi16(inptr[i], sixteen); d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + d[i] = add_clamp(d[i], inptr[i], bd); // Store _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); } diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h index be740a8c7..5bdabc4d6 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -17,6 +17,12 @@ #include "vpx_dsp/inv_txfm.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +static INLINE void extend_64bit(const __m128i in, + __m128i *const out /*out[2]*/) { + out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1 + out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3 +} + static INLINE __m128i wraplow_16bit(const __m128i in0, const __m128i in1, const __m128i rounding) { __m128i temp[2]; @@ -40,24 +46,24 @@ static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) { return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3 } -static INLINE __m128i add_dc_clamp(const __m128i *const min, - const __m128i *const max, - const __m128i *const dc, - const __m128i *const in) { - __m128i out; - out = _mm_adds_epi16(*in, *dc); - out = _mm_max_epi16(out, *min); - out = _mm_min_epi16(out, *max); - return out; +static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, + const int bd) { + const __m128i zero = _mm_set1_epi16(0); + // Faster than _mm_set1_epi16((1 << bd) - 1). + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i d; + + d = _mm_adds_epi16(in0, in1); + d = _mm_max_epi16(d, zero); + d = _mm_min_epi16(d, max); + + return d; } static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, uint16_t *dest, int stride, int bd, const int size) { - const __m128i zero = _mm_setzero_si128(); - // Faster than _mm_set1_epi16((1 << bd) - 1). 
- const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); int a1, i, j; tran_low_t out; __m128i dc, d; @@ -70,43 +76,31 @@ static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, for (i = 0; i < size; ++i) { for (j = 0; j < (size >> 3); ++j) { d = _mm_load_si128((const __m128i *)(&dest[j * 8])); - d = add_dc_clamp(&zero, &max, &dc, &d); + d = add_clamp(d, dc, bd); _mm_store_si128((__m128i *)(&dest[j * 8]), d); } dest += stride; } } -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; +static INLINE void recon_and_store_4_dual(const __m128i in, + uint16_t *const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); + d = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride))); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d)); } -static INLINE void recon_and_store_4(uint16_t *const dest, - const __m128i *const io, const int stride, - int bd) { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = - _mm_unpacklo_epi64(d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, io[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, io[1]), bd); - _mm_storel_epi64((__m128i *)dest, d0); - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); +static INLINE void recon_and_store_4(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_4_dual(in[0], dest, stride, bd); + dest += 2 * stride; + recon_and_store_4_dual(in[1], dest, stride, bd); } #endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h new file mode 100644 index 000000000..72d3d5327 --- /dev/null +++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+
+#include <smmintrin.h>  // SSE4.1
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE __m128i multiplication_round_shift(const __m128i *const in,
+                                                 const __m128i cospi) {
+  __m128i t0, t1;
+  t0 = _mm_mul_epi32(in[0], cospi);
+  t1 = _mm_mul_epi32(in[1], cospi);
+  t0 = dct_const_round_shift_64bit(t0);
+  t1 = dct_const_round_shift_64bit(t1);
+  return pack_4(t0, t1);
+}
+
+static INLINE void multiplication_and_add_2_ssse4_1(const __m128i *const in0,
+                                                    const __m128i *const in1,
+                                                    const __m128i *const cst0,
+                                                    const __m128i *const cst1,
+                                                    __m128i *const out0,
+                                                    __m128i *const out1) {
+  __m128i temp1[4], temp2[4];
+  extend_64bit(*in0, temp1);
+  extend_64bit(*in1, temp2);
+  temp1[2] = _mm_mul_epi32(temp1[0], *cst1);
+  temp1[3] = _mm_mul_epi32(temp1[1], *cst1);
+  temp1[0] = _mm_mul_epi32(temp1[0], *cst0);
+  temp1[1] = _mm_mul_epi32(temp1[1], *cst0);
+  temp2[2] = _mm_mul_epi32(temp2[0], *cst0);
+  temp2[3] = _mm_mul_epi32(temp2[1], *cst0);
+  temp2[0] = _mm_mul_epi32(temp2[0], *cst1);
+  temp2[1] = _mm_mul_epi32(temp2[1], *cst1);
+  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+  *out0 = pack_4(temp1[0], temp1[1]);
+  *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+#endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h
index e6e1cd403..1499b59e5 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -152,28 +152,34 @@ static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {
   _mm_storel_epi64((__m128i *)(dest), d0);
 }
 
+static INLINE void round_shift_8x8(const __m128i *const in,
+                                   __m128i *const out) {
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+
+  out[0] = _mm_add_epi16(in[0], final_rounding);
+  out[1] = _mm_add_epi16(in[1], final_rounding);
+  out[2] = _mm_add_epi16(in[2], final_rounding);
+  out[3] = _mm_add_epi16(in[3], final_rounding);
+  out[4] = _mm_add_epi16(in[4], final_rounding);
+  out[5] = _mm_add_epi16(in[5], final_rounding);
+  out[6] = _mm_add_epi16(in[6], final_rounding);
+  out[7] = _mm_add_epi16(in[7], final_rounding);
+
+  out[0] = _mm_srai_epi16(out[0], 5);
+  out[1] = _mm_srai_epi16(out[1], 5);
+  out[2] = _mm_srai_epi16(out[2], 5);
+  out[3] = _mm_srai_epi16(out[3], 5);
+  out[4] = _mm_srai_epi16(out[4], 5);
+  out[5] = _mm_srai_epi16(out[5], 5);
+  out[6] = _mm_srai_epi16(out[6], 5);
+  out[7] = _mm_srai_epi16(out[7], 5);
+}
+
 static INLINE void write_buffer_8x8(const __m128i *const in,
                                     uint8_t *const dest, const int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
   __m128i t[8];
 
-  // Final rounding and shift
-  t[0] = _mm_adds_epi16(in[0], final_rounding);
-  t[1] = _mm_adds_epi16(in[1], final_rounding);
-  t[2] = _mm_adds_epi16(in[2], final_rounding);
-  t[3] = _mm_adds_epi16(in[3], final_rounding);
-  t[4] = _mm_adds_epi16(in[4], final_rounding);
-  t[5] = _mm_adds_epi16(in[5], final_rounding);
-  t[6] = _mm_adds_epi16(in[6], final_rounding);
-  t[7] = _mm_adds_epi16(in[7], final_rounding);
-
-  t[0] = _mm_srai_epi16(t[0], 5);
-  t[1] = _mm_srai_epi16(t[1], 5);
-  t[2] = _mm_srai_epi16(t[2], 5);
-  t[3] = _mm_srai_epi16(t[3], 5);
-  t[4] = _mm_srai_epi16(t[4], 5);
-  t[5] = _mm_srai_epi16(t[5], 5);
-  t[6] = _mm_srai_epi16(t[6], 5);
-  t[7] = _mm_srai_epi16(t[7], 5);
+
+  round_shift_8x8(in, t);
 
   recon_and_store(dest + 0 * stride, t[0]);
   recon_and_store(dest + 1 * stride, t[1]);
-- 
2.40.0
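
Reviewer note (not part of the patch): the scalar sketch below shows what the new add_clamp() and multiplication_round_shift() helpers compute per element. It assumes the usual libvpx constants from vpx_dsp/txfm_common.h (DCT_CONST_BITS == 14, cospi_16_64 == 11585); the helper names with a _scalar suffix and the inputs in main() are made up for illustration.

/* Scalar reference sketch -- illustrative only, not part of the patch. */
#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14
#define COSPI_16_64 11585 /* round(2^14 * cos(16 * pi / 64)) */

/* add_clamp(): add the inverse-transform residual to the prediction and
 * clamp the sum to the legal range [0, (1 << bd) - 1] of a bd-bit pixel. */
static int add_clamp_scalar(int pred, int residual, int bd) {
  const int max = (1 << bd) - 1;
  int sum = pred + residual;
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;
  return sum;
}

/* multiplication_round_shift(): multiply a 32-bit coefficient by a cosine
 * constant in 64-bit precision, then round and shift back down.  The SSE4.1
 * code feeds in constants pre-shifted left by 2 (cospi_16_64 << 2 above)
 * before dct_const_round_shift_64bit(); per element that is intended to
 * match this plain ROUND_POWER_OF_TWO form. */
static int32_t mul_round_shift_scalar(int32_t in, int32_t cospi) {
  const int64_t product = (int64_t)in * cospi;
  return (int32_t)((product + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

int main(void) {
  /* Arbitrary inputs with bd = 10, i.e. pixels in [0, 1023]. */
  printf("add_clamp(1000, 40, 10) = %d\n",
         add_clamp_scalar(1000, 40, 10)); /* 1040 clamps to 1023 */
  printf("mul_round_shift(8, cospi_16_64) = %d\n",
         mul_round_shift_scalar(8, COSPI_16_64)); /* 8 * cos(pi/4) ~= 6 */
  return 0;
}

Relative to the old clamp_high_sse2(), add_clamp() folds the saturating add into the helper and replaces the compare/andnot/or clamp with a single _mm_max_epi16/_mm_min_epi16 pair, which is why each caller in the 4x4, 8x8 and 16x16 paths shrinks to one call.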