granicus.if.org Git - libvpx/commitdiff
Improve v128 and v64 8-bit shifts for x86
author Steinar Midtskogen <stemidts@cisco.com>
Wed, 28 Sep 2016 15:38:46 +0000 (17:38 +0200)
committer Yaowu Xu <yaowu@google.com>
Tue, 11 Oct 2016 19:36:17 +0000 (12:36 -0700)
Change-Id: I25dc61bab46895d425ce49f89fceb164bee36906
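
The patch replaces unpack/shift/pack sequences with cheaper forms. SSE2 has no 8-bit shift instructions, so the old code widened each byte to 16 bits with an unpack against zero, shifted, and packed the halves back together. For the unsigned shifts the new code shifts whole 16-bit lanes and masks off the bits that leak across byte boundaries: after a 16-bit left shift by c, only the low c bits of each byte hold bits from the neighbouring byte, and those are exactly the bits cleared by _mm_set1_epi8(0xff << c); the right shift uses 0xff >> c the same way. A minimal scalar model of one 16-bit lane (a standalone illustration, not part of the patch):

    /* Check that shifting a whole 16-bit lane and masking each byte
       matches a true per-byte shift, for every input and count 0..7. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      for (unsigned c = 0; c < 8; c++) {
        uint16_t shl_mask = (uint16_t)(((0xff << c) & 0xff) * 0x0101); /* set1_epi8(0xff << c) */
        uint16_t shr_mask = (uint16_t)((0xff >> c) * 0x0101);          /* set1_epi8(0xff >> c) */
        for (uint32_t v = 0; v < 0x10000; v++) {
          uint8_t lo = (uint8_t)v, hi = (uint8_t)(v >> 8);
          uint16_t shl_got = (uint16_t)((v << c) & shl_mask); /* 16-bit shift + mask */
          uint16_t shr_got = (uint16_t)((v >> c) & shr_mask);
          uint16_t shl_ref = (uint16_t)((((hi << c) & 0xff) << 8) | ((lo << c) & 0xff));
          uint16_t shr_ref = (uint16_t)(((hi >> c) << 8) | (lo >> c));
          assert(shl_got == shl_ref && shr_got == shr_ref);
        }
      }
      printf("16-bit shift + byte mask == per-byte shift\n");
      return 0;
    }

This trades roughly seven instructions (two unpacks, two shifts, two fix-up shifts, a pack) for a shift and an AND plus a constant load.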

aom_dsp/simd/v128_intrinsics_x86.h
aom_dsp/simd/v64_intrinsics_x86.h

diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index b9b920da89e8f532a705cebf092eae02bb3cf437..45049960231e2448925e22016093106a102ff35e 100644
@@ -420,26 +420,19 @@ SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
-  __m128i x = _mm_cvtsi32_si128(c);
-  return _mm_packus_epi16(
-      _mm_srli_epi16(
-          _mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x), 8),
-      _mm_srli_epi16(
-          _mm_sll_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x), 8));
+  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
-  __m128i x = _mm_cvtsi32_si128(c + 8);
-  return _mm_packus_epi16(
-      _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x),
-      _mm_srl_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x));
+  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
   __m128i x = _mm_cvtsi32_si128(c + 8);
-  return _mm_packs_epi16(
-      _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), x),
-      _mm_sra_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), x));
+  return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
+                         _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
 }
 
 SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
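
The signed shift cannot be repaired with a mask, since the vacated bits must be copies of each byte's own sign bit. Instead of unpacking against zero, the new code unpacks a against itself, so each 16-bit lane holds two copies of the same byte; an arithmetic shift by c + 8 then discards the duplicate low copy while sign-extending from the byte's real sign bit, and _mm_packs_epi16 narrows the lanes (the values already fit in 8 bits, so its saturation never fires). A scalar model of one byte (an illustration that assumes two's-complement narrowing and an arithmetic >> on negative values, as the SIMD code itself does):

    /* v128_shr_s8 model: duplicate a byte into both halves of an int16,
       then shift arithmetically by c + 8. */
    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      for (unsigned c = 0; c < 8; c++) {
        for (int v = -128; v < 128; v++) {
          int16_t dup = (int16_t)(((v & 0xff) << 8) | (v & 0xff)); /* unpack(a, a) */
          int8_t got = (int8_t)(dup >> (c + 8));                   /* sra by c + 8 */
          assert(got == (int8_t)(v >> c));                         /* per-byte reference */
        }
      }
      return 0;
    }
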
@@ -470,20 +463,13 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
    to enforce that. */
 #define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
 #define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
-#define v128_shl_n_8(a, c)                                                  \
-  _mm_packus_epi16(                                                         \
-      _mm_srli_epi16(                                                       \
-          _mm_slli_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c), 8), \
-      _mm_srli_epi16(                                                       \
-          _mm_slli_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c), 8))
-#define v128_shr_n_u8(a, c)                                             \
-  _mm_packus_epi16(                                                     \
-      _mm_srli_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c + 8), \
-      _mm_srli_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c + 8))
-#define v128_shr_n_s8(a, c)                                             \
-  _mm_packs_epi16(                                                      \
-      _mm_srai_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c + 8), \
-      _mm_srai_epi16(_mm_unpackhi_epi8(_mm_setzero_si128(), a), c + 8))
+#define v128_shl_n_8(a, c) \
+  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v128_shr_n_u8(a, c) \
+  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v128_shr_n_s8(a, c)                                         \
+  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
+                  _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
 #define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
 #define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
 #define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
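
The compile-time-count macros above get the same rewrite with the immediate forms _mm_slli_epi16 / _mm_srli_epi16 / _mm_srai_epi16, with the count written as (c) so expression arguments expand safely. A usage sketch (hypothetical function; v128_dup_8 and v128_or are assumed from the same intrinsics API):

    #include "aom_dsp/simd/v128_intrinsics_x86.h"

    /* The _n_ macros need a count known at compile time; the function
       forms take a runtime count. */
    v128 shift_example(v128 x, unsigned int bits) {
      v128 a = v128_shr_n_u8(x, 3);  /* immediate count */
      v128 b = v128_shr_u8(x, bits); /* runtime count */
      return v128_or(a, b);
    }
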
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index e1b873b2b2e9c6c2ad33bf5e21c7691a01aabc6d..c7e470906ed1d95667cce826ca41220b4b0b2484 100644
@@ -389,25 +389,18 @@ SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }
 SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
-  return _mm_packus_epi16(
-      _mm_srli_epi16(_mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a),
-                                   _mm_cvtsi32_si128(c)),
-                     8),
-      _mm_setzero_si128());
+  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
-  __m128i cp8 = _mm_cvtsi32_si128(c + 8);
-  return _mm_packus_epi16(
-      _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), cp8),
-      _mm_setzero_si128());
+  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
 }
 
 SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
-  __m128i cp8 = _mm_cvtsi32_si128(c + 8);
   return _mm_packs_epi16(
-      _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), cp8),
-      _mm_setzero_si128());
+      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
 }
 
 SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
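
Note the v64 variant packs the shifted lanes against a itself rather than against a zero register: a v64 only defines the low 64 bits, so the upper half of the pack result is dead and any operand will do; reusing a saves materializing the zero.
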
@@ -438,19 +431,12 @@ SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
    to enforce that. */
 #define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
 #define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
-#define v64_shl_n_8(a, c)                                                  \
-  _mm_packus_epi16(                                                        \
-      _mm_srli_epi16(                                                      \
-          _mm_sll_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), c), 8), \
-      _mm_setzero_si128())
-#define v64_shr_n_u8(a, c)                                               \
-  _mm_packus_epi16(                                                      \
-      _mm_srl_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), (c) + 8), \
-      _mm_setzero_si128())
-#define v64_shr_n_s8(a, c)                                               \
-  _mm_packs_epi16(                                                       \
-      _mm_sra_epi16(_mm_unpacklo_epi8(_mm_setzero_si128(), a), (c) + 8), \
-      _mm_setzero_si128())
+#define v64_shl_n_8(a, c) \
+  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v64_shr_n_u8(a, c) \
+  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v64_shr_n_s8(a, c) \
+  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
 #define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
 #define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
 #define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
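
As a sanity check, the new unsigned forms can be compared against a plain byte loop. A standalone sketch, independent of the aom_dsp headers (compile with -msse2):

    /* Verify the v128_shl_8-style expression against a byte-by-byte
       reference for every shift count. */
    #include <assert.h>
    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
      uint8_t in[16], out[16], ref[16];
      for (int i = 0; i < 16; i++) in[i] = (uint8_t)(i * 37 + 11);
      for (unsigned c = 0; c < 8; c++) {
        __m128i a = _mm_loadu_si128((const __m128i *)in);
        __m128i r = _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
                                  _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
        _mm_storeu_si128((__m128i *)out, r);
        for (int i = 0; i < 16; i++) ref[i] = (uint8_t)(in[i] << c);
        assert(memcmp(out, ref, 16) == 0);
      }
      printf("mask-based v128_shl_8 matches the reference\n");
      return 0;
    }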