SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
- return vcombine_s64(b, a);
+ return vcombine_s64((int64x1_t)b, (int64x1_t)a);
}
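// Note: gcc's 32-bit arm_neon.h has historically let int64x1_t behave much
// like a plain 64-bit integer, so v64/v128 code could mix scalars and vectors
// freely; clang's NEON types are true vector types, hence explicit
// (int64x1_t)/(uint64_t) casts such as the ones above.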
SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
}
SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
-#if __OPTIMIZE__
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
return c ? vreinterpretq_s64_s8(
vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
: b;
SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
ssd128_internal s;
- s.hi = s.lo = 0;
+ s.hi = s.lo = (ssd64_internal)(uint64_t)0;
return s;
}
SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
return v128_from_64(
- vreinterpret_s64_u8(
+ (uint64_t)vreinterpret_s64_u8(
vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
vget_high_u8(vreinterpretq_u8_s64(x)) } },
vreinterpret_u8_s64(vget_high_s64(pattern)))),
- vreinterpret_s64_u8(
+ (uint64_t)vreinterpret_s64_u8(
vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
vget_high_u8(vreinterpretq_u8_s64(x)) } },
vreinterpret_u8_s64(vget_low_s64(pattern)))));
vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
}
-#if __OPTIMIZE__
+#if __OPTIMIZE__ && !__clang__
SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
return n < 8
? v128_from_64(
- vorr_u64(vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
- n * 8),
- vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
- (8 - n) * 8)),
- vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8))
- : (n == 8 ? v128_from_64(vreinterpret_u64_s64(vget_low_s64(a)), 0)
- : v128_from_64(
- vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
- (n - 8) * 8),
- 0));
+ (uint64_t)vorr_u64(
+ vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+ n * 8),
+ vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+ (8 - n) * 8)),
+ (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+ n * 8))
+ : (n == 8 ? v128_from_64(
+ (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
+ : v128_from_64((uint64_t)vshl_n_u64(
+ vreinterpret_u64_s64(vget_low_s64(a)),
+ (n - 8) * 8),
+ 0));
}
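// Illustration: v128_shl_n_byte(x, n) above is a whole-vector left shift by
// n bytes with zero fill, e.g. n = 3 shifts the 128-bit value left by 24 bits,
// and n = 8 moves the low 64-bit half into the high half.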
SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
#include <arm_neon.h>
#include "./v64_intrinsics_arm.h"
+#include "aom_ports/arm.h"
-/* vzip in gcc is broken. Fixed in 4.6.1? */
-#if __GNUC__ && \
- ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ < \
- (4 << 16) + (6 << 8) + 1)
-#error vzip buggy in gcc. Get at least gcc 4.6.1.
+#ifdef AOM_INCOMPATIBLE_GCC
+#error Incompatible gcc
#endif
typedef int64x1_t v64;
SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
-SIMD_INLINE uint64_t v64_u64(v64 x) { return x; }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
return *((uint32_t *)p);
}
SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-#if __CC_ARM
+#if __clang__
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((int64x1_t)(uint64_t)a),
+ 0);
+#elif __CC_ARM
*(__packed uint32_t *)p = a;
#elif __GNUC__
*((__attribute((packed)) uint32_t *)p) = a;
#else
- vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64(a), 0);
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((int64x1_t)(uint64_t)a),
+ 0);
#endif
}
vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}
+// The following function requires an immediate.
+// Some compilers will check this when optimising, others won't.
SIMD_INLINE v64 v64_align(v64 a, v64 b, const unsigned int c) {
-#if __OPTIMIZE__
+#if __OPTIMIZE__ && !__clang__
return c ? vreinterpret_s64_s8(
vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
: b;
#else
- return c ? v64_from_64(b >> c * 8) | (a << (8 - c) * 8) : b;
+ return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
+          : b;
#endif
}
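// Illustration: v64_align(a, b, c) above returns the 8 bytes starting at byte
// c of the 16-byte value formed by b (low half) and a (high half); e.g.
// v64_align(a, b, 3) equals
// v64_from_64(((uint64_t)b >> 24) | ((uint64_t)a << 40)).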
int64x2_t r = vpaddlq_s32(vpaddlq_s16(
vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
- return vadd_s64(vget_high_s64(r), vget_low_s64(r));
+ return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
}
SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
int64x2_t r =
vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
- return vget_high_s64(r) + vget_low_s64(r);
+ return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
}
SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
- return vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+ return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
}
SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
- return vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
+ return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
}
typedef uint16x8_t sad64_internal;
SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
- return (uint32_t)(vget_high_u64(r) + vget_low_u64(r));
+ return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
}
typedef uint64x1_t ssd64_internal;
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return 0; }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
+ return (ssd64_internal)(uint64_t)0;
+}
/* Implementation dependent return value. Result must be finalised with
* v64_ssd_u8_sum(). */
return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
}
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return (uint32_t)s; }
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+ return (uint32_t)(uint64_t)s;
+}
SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
}
-#if __OPTIMIZE__
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int c) {
return vshl_n_s64(a, c * 8);