From: Steinar Midtskogen
Date: Thu, 1 Sep 2016 17:45:29 +0000 (+0200)
Subject: Make generic SIMD work with clang.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7b7624e89e9be4da1f402ead8c3e8bceb329bb40;p=libvpx

Make generic SIMD work with clang.

Change-Id: I2c504a078a7137bea6ba50c5768c1295878e9ea1
---
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
index 13d131477..73549b85a 100644
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/aom_dsp/simd/v128_intrinsics_arm.h
@@ -28,7 +28,7 @@ SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
 SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
 
 SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
-  return vcombine_s64(b, a);
+  return vcombine_s64((uint64x1_t)b, (uint64x1_t)a);
 }
 
 SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
@@ -52,7 +52,9 @@ SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
 }
 
 SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
-#if __OPTIMIZE__
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
   return c ? vreinterpretq_s64_s8(
                  vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
            : b;
@@ -122,7 +124,7 @@ typedef struct { ssd64_internal hi, lo; } ssd128_internal;
 
 SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
   ssd128_internal s;
-  s.hi = s.lo = 0;
+  s.hi = s.lo = (ssd64_internal)(uint64_t)0;
   return s;
 }
 
@@ -430,11 +432,11 @@ SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
 
 SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
   return v128_from_64(
-      vreinterpret_s64_u8(
+      (uint64_t)vreinterpret_s64_u8(
           vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
                                     vget_high_u8(vreinterpretq_u8_s64(x)) } },
                    vreinterpret_u8_s64(vget_high_s64(pattern)))),
-      vreinterpret_s64_u8(
+      (uint64_t)vreinterpret_s64_u8(
           vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
                                     vget_high_u8(vreinterpretq_u8_s64(x)) } },
                    vreinterpret_u8_s64(vget_low_s64(pattern)))));
@@ -521,21 +523,24 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
       vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
 }
 
-#if __OPTIMIZE__
+#if __OPTIMIZE__ && !__clang__
 
 SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
   return n < 8
             ? v128_from_64(
-                  vorr_u64(vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                      n * 8),
-                           vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
-                                      (8 - n) * 8)),
-                  vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8))
-            : (n == 8 ? v128_from_64(vreinterpret_u64_s64(vget_low_s64(a)), 0)
-                      : v128_from_64(
-                            vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
-                                       (n - 8) * 8),
-                            0));
+                  (uint64_t)vorr_u64(
+                      vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+                                 n * 8),
+                      vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+                                 (8 - n) * 8)),
+                  (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+                                       n * 8))
+            : (n == 8 ? v128_from_64(
+                            (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
+                      : v128_from_64((uint64_t)vshl_n_u64(
+                                         vreinterpret_u64_s64(vget_low_s64(a)),
+                                         (n - 8) * 8),
+                                     0));
 }
 
 SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index e09cbb930..b9b920da8 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -58,7 +58,9 @@ SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
   _mm_storeu_si128((__m128i *)p, a);
 }
 
-#if defined(__OPTIMIZE__)
+// The following function requires an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
 #if defined(__SSSE3__)
 SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
   return c ? _mm_alignr_epi8(a, b, c) : b;
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index 49bafbc27..bf9216735 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -14,12 +14,10 @@
 #include <arm_neon.h>
 
 #include "./v64_intrinsics_arm.h"
+#include "aom_ports/arm.h"
 
-/* vzip in gcc is broken. Fixed in 4.6.1? */
-#if __GNUC__ && \
-    ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ < \
-     (4 << 16) + (6 << 8) + 1)
-#error vzip buggy in gcc. Get at least gcc 4.6.1.
+#ifdef AOM_INCOMPATIBLE_GCC
+#error Incompatible gcc
 #endif
 
 typedef int64x1_t v64;
@@ -51,7 +49,7 @@ SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
 
 SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
 
-SIMD_INLINE uint64_t v64_u64(v64 x) { return x; }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
 
 SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
   return *((uint32_t *)p);
@@ -66,12 +64,16 @@ SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
 }
 
 SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-#if __CC_ARM
+#if __clang__
+  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+                0);
+#elif __CC_ARM
   *(__packed uint32_t *)p) = a;
 #elif __GNUC__
   *((__attribute((packed)) uint32_t *)p) = a;
 #else
-  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64(a), 0);
+  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+                0);
 #endif
 }
 
@@ -91,13 +93,16 @@ SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
   vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
 }
 
+// The following function requires an immediate.
+// Some compilers will check this if it's optimising, others won't.
 SIMD_INLINE v64 v64_align(v64 a, v64 b, const unsigned int c) {
-#if __OPTIMIZE__
+#if __OPTIMIZE__ && !__clang__
   return c ? vreinterpret_s64_s8(
                  vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
            : b;
 #else
-  return c ? v64_from_64(b >> c * 8) | (a << (8 - c) * 8) : b;
+  return c ? v64_from_64((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8)
+           : b;
 #endif
 }
 
@@ -121,21 +126,21 @@ SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
   int64x2_t r = vpaddlq_s32(vpaddlq_s16(
       vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
                 vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
-  return vadd_s64(vget_high_s64(r), vget_low_s64(r));
+  return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
 }
 
 SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
   int64x2_t r =
       vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-  return vget_high_s64(r) + vget_low_s64(r);
+  return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
 }
 
 SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
-  return vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+  return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
 }
 
 SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
-  return vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
+  return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
 }
 
 typedef uint16x8_t sad64_internal;
@@ -151,12 +156,14 @@ SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
 
 SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
   uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
-  return (uint32_t)(vget_high_u64(r) + vget_low_u64(r));
+  return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
 }
 
 typedef int64x1_t ssd64_internal;
 
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return 0; }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
+  return (ssd64_internal)(uint64_t)0;
+}
 
 /* Implementation dependent return value. Result must be finalised with
  * v64_ssd_u8_sum(). */
@@ -166,7 +173,9 @@ SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
   return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
 }
 
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return (uint32_t)s; }
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+  return (uint32_t)(uint64_t)s;
+}
 
 SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
 
@@ -470,7 +479,9 @@ SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
       vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
 }
 
-#if __OPTIMIZE__
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if __OPTIMIZE__ && !__clang__
 
 SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int c) {
   return vshl_n_s64(a, c * 8);
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 502df2398..e1b873b2b 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -86,6 +86,7 @@ SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
   _mm_storel_epi64((__m128i *)p, a);
 }
 
+// The following function requires an immediate.
 #if __OPTIMIZE__
 #define v64_align(a, b, c) \
   (c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b;
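
The casts added throughout the patch all address one difference between the compilers: the gcc versions this code targeted let the 64-bit NEON types (int64x1_t, uint64x1_t) convert implicitly to and from plain 64-bit integers, while clang treats them as incompatible and rejects the conversion, so every crossing has to be spelled out as a same-width cast. A minimal sketch of that pattern, using hypothetical helper names (lane_as_u64 and lane_from_u64 are not part of the patch):

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical helpers illustrating the conversions the patch makes
 * explicit: clang rejects the implicit int64x1_t <-> uint64_t conversion
 * that gcc accepted, so the value is bit-cast with a same-width C cast,
 * exactly as v64_u64() and v128_from_64() now do. */
static inline uint64_t lane_as_u64(int64x1_t v) {
  /* return v;          -- accepted by gcc, rejected by clang */
  return (uint64_t)v;   /* accepted by both */
}

static inline int64x1_t lane_from_u64(uint64_t x) {
  return (int64x1_t)x;  /* same-width cast in the other direction */
}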
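
The "__OPTIMIZE__ && !__clang__" guards deal with a separate difference: intrinsics such as vext_s8 and vshl_n_u64 require an immediate operand, and while gcc is satisfied once inlining and constant propagation turn the inline-function argument into a constant, clang checks the constraint regardless of optimisation level, which is why the immediate paths are now compiled out under clang. A rough sketch of the guard with a hypothetical helper (shl_bytes and its vshl_u64 fallback are not taken from the patch):

#include <arm_neon.h>

/* Hypothetical byte-shift helper showing why the immediate path is gated:
 * vshl_n_u64() demands a compile-time constant shift, which gcc only
 * obtains for 'n' after inlining under optimisation, and which clang
 * refuses to take on trust. */
static inline uint64x1_t shl_bytes(uint64x1_t a, const unsigned int n) {
#if __OPTIMIZE__ && !__clang__
  return vshl_n_u64(a, n * 8);            /* immediate form */
#else
  return vshl_u64(a, vdup_n_s64(n * 8));  /* variable-count fallback */
#endif
}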