From: Steinar Midtskogen
Date: Wed, 28 Sep 2016 15:38:46 +0000 (+0200)
Subject: Improve v128 and v64 8 bit shifts for x86
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9d6a53b8fde3614cc50ff103f61fa3004a18755e;p=libvpx

Improve v128 and v64 8 bit shifts for x86

Change-Id: I25dc61bab46895d425ce49f89fceb164bee36906
---

diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index b9b920da8..450499602 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -420,26 +420,19 @@ SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
-  __m128i x = _mm_cvtsi32_si128(c);
-  return _mm_packus_epi16(
-      _mm_srli_epi16(
-          _mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x), 8),
-      _mm_srli_epi16(
-          _mm_sll_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x), 8));
+  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
-  __m128i x = _mm_cvtsi32_si128(c + 8);
-  return _mm_packus_epi16(
-      _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x),
-      _mm_srl_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x));
+  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
   __m128i x = _mm_cvtsi32_si128(c + 8);
-  return _mm_packs_epi16(
-      _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x),
-      _mm_sra_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x));
+  return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
+                         _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
 }
 
 SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
@@ -470,20 +463,13 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
    to enforce that. */
 #define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
 #define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
-#define v128_shl_n_8(a, c) \
-  _mm_packus_epi16( \
-      _mm_srli_epi16( \
-          _mm_slli_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c), 8), \
-      _mm_srli_epi16( \
-          _mm_slli_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c), 8))
-#define v128_shr_n_u8(a, c) \
-  _mm_packus_epi16( \
-      _mm_srli_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c + 8), \
-      _mm_srli_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c + 8))
-#define v128_shr_n_s8(a, c) \
-  _mm_packs_epi16( \
-      _mm_srai_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c + 8), \
-      _mm_srai_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c + 8))
+#define v128_shl_n_8(a, c) \
+  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v128_shr_n_u8(a, c) \
+  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v128_shr_n_s8(a, c) \
+  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
+                  _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
 #define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
 #define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
 #define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index e1b873b2b..c7e470906 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -389,25 +389,18 @@ SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }
 SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
-  return _mm_packus_epi16(
-      _mm_srli_epi16(_mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a),
-                                   _mm_cvtsi32_si128(c)),
-                     8),
-      _mm_setzero_si128());
+  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
-  __m128i cp8 = _mm_cvtsi32_si128(c + 8);
-  return _mm_packus_epi16(
-      _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), cp8),
-      _mm_setzero_si128());
+  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
-  __m128i cp8 = _mm_cvtsi32_si128(c + 8);
   return _mm_packs_epi16(
-      _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), cp8),
-      _mm_setzero_si128());
+      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
 }
 
 SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
@@ -438,19 +431,12 @@ SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
    to enforce that. */
 #define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
 #define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
-#define v64_shl_n_8(a, c) \
-  _mm_packus_epi16( \
-      _mm_srli_epi16( \
-          _mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c), 8), \
-      _mm_setzero_si128())
-#define v64_shr_n_u8(a, c) \
-  _mm_packus_epi16( \
-      _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), (c) + 8), \
-      _mm_setzero_si128())
-#define v64_shr_n_s8(a, c) \
-  _mm_packs_epi16( \
-      _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), (c) + 8), \
-      _mm_setzero_si128())
+#define v64_shl_n_8(a, c) \
+  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v64_shr_n_u8(a, c) \
+  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v64_shr_n_s8(a, c) \
+  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
 #define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
 #define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
 #define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
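
Note on the approach: SSE2 has no 8-bit shift instruction, so the old code widened every byte to 16 bits, shifted, and re-packed. The new code shifts the whole register with a 16-bit shift and then masks each byte, because the only bits that move across a byte boundary inside a 16-bit lane are exactly the bits the mask clears: _mm_set1_epi8((uint8_t)(0xff << c)) after a left shift and _mm_set1_epi8(0xff >> c) after a logical right shift. The signed right shift instead duplicates each byte into both halves of a 16-bit lane (_mm_unpacklo_epi8(a, a) / _mm_unpackhi_epi8(a, a)), arithmetic-shifts by c + 8 and re-packs; the per-lane result always fits in int8, so the saturating pack returns it unchanged. The scalar program below is a hypothetical self-check of those identities, modelling a single 16-bit lane that holds two bytes; it is not part of this patch or of the libaom test suite.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  for (unsigned int c = 0; c < 8; c++) {
    /* The same per-byte masks the patch feeds to _mm_set1_epi8(). */
    const uint8_t shl_mask = (uint8_t)(0xff << c);
    const uint8_t shr_mask = (uint8_t)(0xff >> c);
    for (unsigned int lo = 0; lo < 256; lo++) {
      /* Signed right shift: duplicating the byte into the high half of a
         16-bit lane (what _mm_unpacklo_epi8(a, a) does) and shifting
         arithmetically by c + 8 gives the per-byte arithmetic shift; the
         result fits in int8, so a saturating pack keeps it exact.  This
         relies on >> of a negative int being arithmetic, as it is on the
         compilers libaom targets and in _mm_sra_epi16 itself. */
      const int16_t dup = (int16_t)((lo << 8) | lo);
      assert((dup >> (c + 8)) == ((int8_t)lo >> c));

      for (unsigned int hi = 0; hi < 256; hi++) {
        const uint16_t lane = (uint16_t)((hi << 8) | lo); /* two packed bytes */

        /* Left shift: only the low byte's top bits spill into the low c
           bits of the high byte, and shl_mask clears exactly those bits. */
        const uint16_t shl = (uint16_t)(lane << c);
        assert((uint8_t)(shl & shl_mask) == (uint8_t)(lo << c));
        assert((uint8_t)((shl >> 8) & shl_mask) == (uint8_t)(hi << c));

        /* Logical right shift: only the high byte's low bits spill into
           the top c bits of the low byte; shr_mask clears them. */
        const uint16_t shr = (uint16_t)(lane >> c);
        assert((uint8_t)(shr & shr_mask) == (uint8_t)(lo >> c));
        assert((uint8_t)((shr >> 8) & shr_mask) == (uint8_t)(hi >> c));
      }
    }
  }
  printf("8-bit shift identities hold for all bytes and shift amounts 0..7\n");
  return 0;
}

Built with a stock C compiler (e.g. cc check.c && ./a.out, where check.c is a placeholder name), every assert should hold for all byte values and shift amounts 0 through 7, which is the range the intrinsics are meant to support.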